Compare commits
5 Commits
v0.2.76
...
proxy-supp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3caf48c9be | ||
|
|
2ba70b9501 | ||
|
|
16f98cebc0 | ||
|
|
fe9ff498ce | ||
|
|
eba831ca30 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -189,6 +189,4 @@ a.txt
|
||||
.lambda_function.py
|
||||
ec2*
|
||||
|
||||
update_changelog.sh
|
||||
test_env/
|
||||
tmp/
|
||||
update_changelog.sh
|
||||
27
README.md
27
README.md
@@ -190,6 +190,33 @@ result = crawler.run(
|
||||
print(result.extracted_content)
|
||||
```
|
||||
|
||||
### Extract Structured Data from Web Pages With Proxy and BaseUrl
|
||||
|
||||
```python
|
||||
from crawl4ai import WebCrawler
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
def create_crawler():
|
||||
crawler = WebCrawler(verbose=True, proxy="http://127.0.0.1:7890")
|
||||
crawler.warmup()
|
||||
return crawler
|
||||
|
||||
crawler = create_crawler()
|
||||
|
||||
crawler.warmup()
|
||||
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token="sk-",
|
||||
base_url="https://api.openai.com/v1"
|
||||
)
|
||||
)
|
||||
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
## Documentation 📚
|
||||
|
||||
For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
|
||||
|
||||
@@ -82,6 +82,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
||||
self.options = Options()
|
||||
self.options.headless = True
|
||||
if kwargs.get("proxy"):
|
||||
self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
|
||||
if kwargs.get("user_agent"):
|
||||
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
|
||||
else:
|
||||
@@ -242,6 +244,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
driver.quit()
|
||||
|
||||
# Execute JS code if provided
|
||||
self.js_code = kwargs.get("js_code", self.js_code)
|
||||
if self.js_code and type(self.js_code) == str:
|
||||
self.driver.execute_script(self.js_code)
|
||||
# Optionally, wait for some condition after executing the JS code
|
||||
|
||||
@@ -79,6 +79,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
||||
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
||||
self.apply_chunking = kwargs.get("apply_chunking", True)
|
||||
self.base_url = kwargs.get("base_url", None)
|
||||
if not self.apply_chunking:
|
||||
self.chunk_token_threshold = 1e9
|
||||
|
||||
@@ -101,7 +102,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
variable_values["REQUEST"] = self.instruction
|
||||
prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
|
||||
|
||||
if self.extract_type == "schema" and self.schema:
|
||||
if self.extract_type == "schema":
|
||||
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
|
||||
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
|
||||
|
||||
@@ -110,7 +111,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
"{" + variable + "}", variable_values[variable]
|
||||
)
|
||||
|
||||
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token) # , json_response=self.extract_type == "schema")
|
||||
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema")
|
||||
try:
|
||||
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
||||
blocks = json.loads(blocks)
|
||||
|
||||
@@ -29,7 +29,7 @@ To generate the JSON objects:
|
||||
|
||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||
|
||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
|
||||
Please provide your output within <blocks> tags, like this:
|
||||
|
||||
@@ -87,7 +87,7 @@ To generate the JSON objects:
|
||||
|
||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||
|
||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
|
||||
7. Never alter the extracted content, just copy and paste it as it is.
|
||||
|
||||
@@ -142,7 +142,7 @@ To generate the JSON objects:
|
||||
|
||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||
|
||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
|
||||
7. Never alter the extracted content, just copy and paste it as it is.
|
||||
|
||||
@@ -201,4 +201,4 @@ Avoid Common Mistakes:
|
||||
- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format.
|
||||
|
||||
Result
|
||||
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
||||
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
||||
|
||||
@@ -634,7 +634,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
return node
|
||||
|
||||
body = flatten_nested_elements(body)
|
||||
|
||||
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
for img in imgs:
|
||||
src = img.get('src', '')
|
||||
if base64_pattern.match(src):
|
||||
# Replace base64 data with empty string
|
||||
img['src'] = base64_pattern.sub('', src)
|
||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||
cleaned_html = sanitize_html(cleaned_html)
|
||||
|
||||
@@ -716,7 +721,7 @@ def extract_xml_data(tags, string):
|
||||
return data
|
||||
|
||||
# Function to perform the completion with exponential backoff
|
||||
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False):
|
||||
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None):
|
||||
from litellm import completion
|
||||
from litellm.exceptions import RateLimitError
|
||||
max_attempts = 3
|
||||
@@ -735,6 +740,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
|
||||
],
|
||||
temperature=0.01,
|
||||
api_key=api_token,
|
||||
base_url=base_url,
|
||||
**extra_args
|
||||
)
|
||||
return response # Return the successful response
|
||||
@@ -755,7 +761,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
|
||||
"content": ["Rate limit error. Please try again later."]
|
||||
}]
|
||||
|
||||
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
||||
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
|
||||
# api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
||||
api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
|
||||
|
||||
@@ -770,7 +776,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
||||
"{" + variable + "}", variable_values[variable]
|
||||
)
|
||||
|
||||
response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)
|
||||
response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url)
|
||||
|
||||
try:
|
||||
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
||||
@@ -834,6 +840,7 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
|
||||
|
||||
return sum(all_blocks, [])
|
||||
|
||||
|
||||
def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
||||
"""
|
||||
Merges small chunks into larger ones based on the total token threshold.
|
||||
@@ -863,22 +870,23 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
||||
|
||||
return merged_sections
|
||||
|
||||
def process_sections(url: str, sections: list, provider: str, api_token: str) -> list:
|
||||
def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
|
||||
extracted_content = []
|
||||
if provider.startswith("groq/"):
|
||||
# Sequential processing with a delay
|
||||
for section in sections:
|
||||
extracted_content.extend(extract_blocks(url, section, provider, api_token))
|
||||
extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url))
|
||||
time.sleep(0.5) # 500 ms delay between each processing
|
||||
else:
|
||||
# Parallel processing using ThreadPoolExecutor
|
||||
with ThreadPoolExecutor() as executor:
|
||||
futures = [executor.submit(extract_blocks, url, section, provider, api_token) for section in sections]
|
||||
futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections]
|
||||
for future in as_completed(futures):
|
||||
extracted_content.extend(future.result())
|
||||
|
||||
return extracted_content
|
||||
|
||||
|
||||
def wrap_text(draw, text, font, max_width):
|
||||
# Wrap the text to fit within the specified width
|
||||
lines = []
|
||||
@@ -890,6 +898,7 @@ def wrap_text(draw, text, font, max_width):
|
||||
lines.append(line)
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def format_html(html_string):
|
||||
soup = BeautifulSoup(html_string, 'html.parser')
|
||||
return soup.prettify()
|
||||
|
||||
@@ -22,9 +22,10 @@ class WebCrawler:
|
||||
crawler_strategy: CrawlerStrategy = None,
|
||||
always_by_pass_cache: bool = False,
|
||||
verbose: bool = False,
|
||||
proxy: str = None,
|
||||
):
|
||||
# self.db_path = db_path
|
||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
|
||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose, proxy=proxy)
|
||||
self.always_by_pass_cache = always_by_pass_cache
|
||||
|
||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||
|
||||
2
setup.py
2
setup.py
@@ -19,7 +19,7 @@ with open("requirements.txt") as f:
|
||||
requirements = f.read().splitlines()
|
||||
|
||||
# Define the requirements for different environments
|
||||
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn"))]
|
||||
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "numpy"))]
|
||||
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
|
||||
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user