Compare commits
5 Commits
pull-84
...
proxy-supp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3caf48c9be | ||
|
|
2ba70b9501 | ||
|
|
16f98cebc0 | ||
|
|
fe9ff498ce | ||
|
|
eba831ca30 |
27
README.md
27
README.md
@@ -190,6 +190,33 @@ result = crawler.run(
|
|||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Extract Structured Data from Web Pages With Proxy and BaseUrl
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import WebCrawler
|
||||||
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
|
def create_crawler():
|
||||||
|
crawler = WebCrawler(verbose=True, proxy="http://127.0.0.1:7890")
|
||||||
|
crawler.warmup()
|
||||||
|
return crawler
|
||||||
|
|
||||||
|
crawler = create_crawler()
|
||||||
|
|
||||||
|
crawler.warmup()
|
||||||
|
|
||||||
|
result = crawler.run(
|
||||||
|
url="https://www.nbcnews.com/business",
|
||||||
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
|
provider="openai/gpt-4o",
|
||||||
|
api_token="sk-",
|
||||||
|
base_url="https://api.openai.com/v1"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
print(result.markdown)
|
||||||
|
```
|
||||||
|
|
||||||
## Documentation 📚
|
## Documentation 📚
|
||||||
|
|
||||||
For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
|
For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
|
||||||
|
|||||||
@@ -82,6 +82,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
||||||
self.options = Options()
|
self.options = Options()
|
||||||
self.options.headless = True
|
self.options.headless = True
|
||||||
|
if kwargs.get("proxy"):
|
||||||
|
self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
|
||||||
if kwargs.get("user_agent"):
|
if kwargs.get("user_agent"):
|
||||||
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
|
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
|
||||||
else:
|
else:
|
||||||
@@ -242,6 +244,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
# Execute JS code if provided
|
# Execute JS code if provided
|
||||||
|
self.js_code = kwargs.get("js_code", self.js_code)
|
||||||
if self.js_code and type(self.js_code) == str:
|
if self.js_code and type(self.js_code) == str:
|
||||||
self.driver.execute_script(self.js_code)
|
self.driver.execute_script(self.js_code)
|
||||||
# Optionally, wait for some condition after executing the JS code
|
# Optionally, wait for some condition after executing the JS code
|
||||||
|
|||||||
@@ -79,6 +79,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
||||||
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
||||||
self.apply_chunking = kwargs.get("apply_chunking", True)
|
self.apply_chunking = kwargs.get("apply_chunking", True)
|
||||||
|
self.base_url = kwargs.get("base_url", None)
|
||||||
if not self.apply_chunking:
|
if not self.apply_chunking:
|
||||||
self.chunk_token_threshold = 1e9
|
self.chunk_token_threshold = 1e9
|
||||||
|
|
||||||
@@ -110,7 +111,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
"{" + variable + "}", variable_values[variable]
|
"{" + variable + "}", variable_values[variable]
|
||||||
)
|
)
|
||||||
|
|
||||||
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token) # , json_response=self.extract_type == "schema")
|
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema")
|
||||||
try:
|
try:
|
||||||
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
||||||
blocks = json.loads(blocks)
|
blocks = json.loads(blocks)
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ To generate the JSON objects:
|
|||||||
|
|
||||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||||
|
|
||||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||||
|
|
||||||
Please provide your output within <blocks> tags, like this:
|
Please provide your output within <blocks> tags, like this:
|
||||||
|
|
||||||
@@ -87,7 +87,7 @@ To generate the JSON objects:
|
|||||||
|
|
||||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||||
|
|
||||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||||
|
|
||||||
7. Never alter the extracted content, just copy and paste it as it is.
|
7. Never alter the extracted content, just copy and paste it as it is.
|
||||||
|
|
||||||
@@ -142,7 +142,7 @@ To generate the JSON objects:
|
|||||||
|
|
||||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||||
|
|
||||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||||
|
|
||||||
7. Never alter the extracted content, just copy and paste it as it is.
|
7. Never alter the extracted content, just copy and paste it as it is.
|
||||||
|
|
||||||
@@ -201,4 +201,4 @@ Avoid Common Mistakes:
|
|||||||
- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format.
|
- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format.
|
||||||
|
|
||||||
Result
|
Result
|
||||||
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
||||||
|
|||||||
@@ -634,7 +634,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
return node
|
return node
|
||||||
|
|
||||||
body = flatten_nested_elements(body)
|
body = flatten_nested_elements(body)
|
||||||
|
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||||
|
for img in imgs:
|
||||||
|
src = img.get('src', '')
|
||||||
|
if base64_pattern.match(src):
|
||||||
|
# Replace base64 data with empty string
|
||||||
|
img['src'] = base64_pattern.sub('', src)
|
||||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||||
cleaned_html = sanitize_html(cleaned_html)
|
cleaned_html = sanitize_html(cleaned_html)
|
||||||
|
|
||||||
@@ -716,7 +721,7 @@ def extract_xml_data(tags, string):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
# Function to perform the completion with exponential backoff
|
# Function to perform the completion with exponential backoff
|
||||||
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False):
|
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None):
|
||||||
from litellm import completion
|
from litellm import completion
|
||||||
from litellm.exceptions import RateLimitError
|
from litellm.exceptions import RateLimitError
|
||||||
max_attempts = 3
|
max_attempts = 3
|
||||||
@@ -735,6 +740,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
|
|||||||
],
|
],
|
||||||
temperature=0.01,
|
temperature=0.01,
|
||||||
api_key=api_token,
|
api_key=api_token,
|
||||||
|
base_url=base_url,
|
||||||
**extra_args
|
**extra_args
|
||||||
)
|
)
|
||||||
return response # Return the successful response
|
return response # Return the successful response
|
||||||
@@ -755,7 +761,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
|
|||||||
"content": ["Rate limit error. Please try again later."]
|
"content": ["Rate limit error. Please try again later."]
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
|
||||||
# api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
# api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
||||||
api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
|
api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
|
||||||
|
|
||||||
@@ -770,7 +776,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
|||||||
"{" + variable + "}", variable_values[variable]
|
"{" + variable + "}", variable_values[variable]
|
||||||
)
|
)
|
||||||
|
|
||||||
response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)
|
response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
||||||
@@ -864,17 +870,17 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
|||||||
|
|
||||||
return merged_sections
|
return merged_sections
|
||||||
|
|
||||||
def process_sections(url: str, sections: list, provider: str, api_token: str) -> list:
|
def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
|
||||||
extracted_content = []
|
extracted_content = []
|
||||||
if provider.startswith("groq/"):
|
if provider.startswith("groq/"):
|
||||||
# Sequential processing with a delay
|
# Sequential processing with a delay
|
||||||
for section in sections:
|
for section in sections:
|
||||||
extracted_content.extend(extract_blocks(url, section, provider, api_token))
|
extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url))
|
||||||
time.sleep(0.5) # 500 ms delay between each processing
|
time.sleep(0.5) # 500 ms delay between each processing
|
||||||
else:
|
else:
|
||||||
# Parallel processing using ThreadPoolExecutor
|
# Parallel processing using ThreadPoolExecutor
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
futures = [executor.submit(extract_blocks, url, section, provider, api_token) for section in sections]
|
futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections]
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
extracted_content.extend(future.result())
|
extracted_content.extend(future.result())
|
||||||
|
|
||||||
|
|||||||
@@ -22,9 +22,10 @@ class WebCrawler:
|
|||||||
crawler_strategy: CrawlerStrategy = None,
|
crawler_strategy: CrawlerStrategy = None,
|
||||||
always_by_pass_cache: bool = False,
|
always_by_pass_cache: bool = False,
|
||||||
verbose: bool = False,
|
verbose: bool = False,
|
||||||
|
proxy: str = None,
|
||||||
):
|
):
|
||||||
# self.db_path = db_path
|
# self.db_path = db_path
|
||||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
|
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose, proxy=proxy)
|
||||||
self.always_by_pass_cache = always_by_pass_cache
|
self.always_by_pass_cache = always_by_pass_cache
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||||
|
|||||||
Reference in New Issue
Block a user