Merge branch 'proxy-support' into staging

This commit is contained in:
unclecode
2024-09-01 16:35:14 +08:00
6 changed files with 51 additions and 13 deletions

View File

@@ -190,6 +190,33 @@ result = crawler.run(
print(result.extracted_content)
```
### Extract Structured Data from Web Pages With Proxy and BaseUrl
```python
from crawl4ai import WebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
def create_crawler():
crawler = WebCrawler(verbose=True, proxy="http://127.0.0.1:7890")
crawler.warmup()
return crawler
crawler = create_crawler()
crawler.warmup()
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o",
api_token="sk-",
base_url="https://api.openai.com/v1"
)
)
print(result.markdown)
```
## Documentation 📚
For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).

View File

@@ -82,6 +82,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
self.options = Options()
self.options.headless = True
if kwargs.get("proxy"):
self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
if kwargs.get("user_agent"):
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
else:
@@ -242,6 +244,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
driver.quit()
# Execute JS code if provided
self.js_code = kwargs.get("js_code", self.js_code)
if self.js_code and type(self.js_code) == str:
self.driver.execute_script(self.js_code)
# Optionally, wait for some condition after executing the JS code

View File

@@ -79,6 +79,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
self.apply_chunking = kwargs.get("apply_chunking", True)
self.base_url = kwargs.get("base_url", None)
if not self.apply_chunking:
self.chunk_token_threshold = 1e9
@@ -110,7 +111,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
"{" + variable + "}", variable_values[variable]
)
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token) # , json_response=self.extract_type == "schema")
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema")
try:
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
blocks = json.loads(blocks)

View File

@@ -29,7 +29,7 @@ To generate the JSON objects:
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
Please provide your output within <blocks> tags, like this:
@@ -87,7 +87,7 @@ To generate the JSON objects:
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
7. Never alter the extracted content, just copy and paste it as it is.
@@ -142,7 +142,7 @@ To generate the JSON objects:
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
7. Never alter the extracted content, just copy and paste it as it is.
@@ -201,4 +201,4 @@ Avoid Common Mistakes:
- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format.
Result
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""

View File

@@ -643,7 +643,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
return node
body = flatten_nested_elements(body)
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
for img in imgs:
src = img.get('src', '')
if base64_pattern.match(src):
# Replace base64 data with empty string
img['src'] = base64_pattern.sub('', src)
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
cleaned_html = sanitize_html(cleaned_html)
@@ -725,7 +730,7 @@ def extract_xml_data(tags, string):
return data
# Function to perform the completion with exponential backoff
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False):
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None):
from litellm import completion
from litellm.exceptions import RateLimitError
max_attempts = 3
@@ -744,6 +749,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
],
temperature=0.01,
api_key=api_token,
base_url=base_url,
**extra_args
)
return response # Return the successful response
@@ -764,7 +770,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
"content": ["Rate limit error. Please try again later."]
}]
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
# api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
@@ -779,7 +785,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
"{" + variable + "}", variable_values[variable]
)
response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)
response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url)
try:
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
@@ -872,17 +878,17 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):
return merged_sections
def process_sections(url: str, sections: list, provider: str, api_token: str) -> list:
def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
extracted_content = []
if provider.startswith("groq/"):
# Sequential processing with a delay
for section in sections:
extracted_content.extend(extract_blocks(url, section, provider, api_token))
extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url))
time.sleep(0.5) # 500 ms delay between each processing
else:
# Parallel processing using ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
futures = [executor.submit(extract_blocks, url, section, provider, api_token) for section in sections]
futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections]
for future in as_completed(futures):
extracted_content.extend(future.result())

View File

@@ -22,9 +22,10 @@ class WebCrawler:
crawler_strategy: CrawlerStrategy = None,
always_by_pass_cache: bool = False,
verbose: bool = False,
proxy: str = None,
):
# self.db_path = db_path
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose, proxy=proxy)
self.always_by_pass_cache = always_by_pass_cache
# Create the .crawl4ai folder in the user's home directory if it doesn't exist