Compare commits
2 Commits
proxy-supp
...
v0.2.76
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7afa11a02f | ||
|
|
dec3d44224 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -189,4 +189,6 @@ a.txt
|
|||||||
.lambda_function.py
|
.lambda_function.py
|
||||||
ec2*
|
ec2*
|
||||||
|
|
||||||
update_changelog.sh
|
update_changelog.sh
|
||||||
|
test_env/
|
||||||
|
tmp/
|
||||||
27
README.md
27
README.md
@@ -190,33 +190,6 @@ result = crawler.run(
|
|||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Extract Structured Data from Web Pages With Proxy and BaseUrl
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import WebCrawler
|
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
|
||||||
|
|
||||||
def create_crawler():
|
|
||||||
crawler = WebCrawler(verbose=True, proxy="http://127.0.0.1:7890")
|
|
||||||
crawler.warmup()
|
|
||||||
return crawler
|
|
||||||
|
|
||||||
crawler = create_crawler()
|
|
||||||
|
|
||||||
crawler.warmup()
|
|
||||||
|
|
||||||
result = crawler.run(
|
|
||||||
url="https://www.nbcnews.com/business",
|
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
|
||||||
provider="openai/gpt-4o",
|
|
||||||
api_token="sk-",
|
|
||||||
base_url="https://api.openai.com/v1"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
print(result.markdown)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Documentation 📚
|
## Documentation 📚
|
||||||
|
|
||||||
For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
|
For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
|
||||||
|
|||||||
@@ -82,8 +82,6 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
||||||
self.options = Options()
|
self.options = Options()
|
||||||
self.options.headless = True
|
self.options.headless = True
|
||||||
if kwargs.get("proxy"):
|
|
||||||
self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
|
|
||||||
if kwargs.get("user_agent"):
|
if kwargs.get("user_agent"):
|
||||||
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
|
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
|
||||||
else:
|
else:
|
||||||
@@ -244,7 +242,6 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
# Execute JS code if provided
|
# Execute JS code if provided
|
||||||
self.js_code = kwargs.get("js_code", self.js_code)
|
|
||||||
if self.js_code and type(self.js_code) == str:
|
if self.js_code and type(self.js_code) == str:
|
||||||
self.driver.execute_script(self.js_code)
|
self.driver.execute_script(self.js_code)
|
||||||
# Optionally, wait for some condition after executing the JS code
|
# Optionally, wait for some condition after executing the JS code
|
||||||
|
|||||||
@@ -79,7 +79,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
||||||
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
||||||
self.apply_chunking = kwargs.get("apply_chunking", True)
|
self.apply_chunking = kwargs.get("apply_chunking", True)
|
||||||
self.base_url = kwargs.get("base_url", None)
|
|
||||||
if not self.apply_chunking:
|
if not self.apply_chunking:
|
||||||
self.chunk_token_threshold = 1e9
|
self.chunk_token_threshold = 1e9
|
||||||
|
|
||||||
@@ -102,7 +101,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
variable_values["REQUEST"] = self.instruction
|
variable_values["REQUEST"] = self.instruction
|
||||||
prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
|
prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
|
||||||
|
|
||||||
if self.extract_type == "schema":
|
if self.extract_type == "schema" and self.schema:
|
||||||
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
|
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
|
||||||
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
|
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
|
||||||
|
|
||||||
@@ -111,7 +110,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
"{" + variable + "}", variable_values[variable]
|
"{" + variable + "}", variable_values[variable]
|
||||||
)
|
)
|
||||||
|
|
||||||
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema")
|
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token) # , json_response=self.extract_type == "schema")
|
||||||
try:
|
try:
|
||||||
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
||||||
blocks = json.loads(blocks)
|
blocks = json.loads(blocks)
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ To generate the JSON objects:
|
|||||||
|
|
||||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||||
|
|
||||||
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||||
|
|
||||||
Please provide your output within <blocks> tags, like this:
|
Please provide your output within <blocks> tags, like this:
|
||||||
|
|
||||||
@@ -87,7 +87,7 @@ To generate the JSON objects:
|
|||||||
|
|
||||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||||
|
|
||||||
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||||
|
|
||||||
7. Never alter the extracted content, just copy and paste it as it is.
|
7. Never alter the extracted content, just copy and paste it as it is.
|
||||||
|
|
||||||
@@ -142,7 +142,7 @@ To generate the JSON objects:
|
|||||||
|
|
||||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||||
|
|
||||||
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||||
|
|
||||||
7. Never alter the extracted content, just copy and paste it as it is.
|
7. Never alter the extracted content, just copy and paste it as it is.
|
||||||
|
|
||||||
@@ -201,4 +201,4 @@ Avoid Common Mistakes:
|
|||||||
- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format.
|
- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format.
|
||||||
|
|
||||||
Result
|
Result
|
||||||
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
||||||
@@ -634,12 +634,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
return node
|
return node
|
||||||
|
|
||||||
body = flatten_nested_elements(body)
|
body = flatten_nested_elements(body)
|
||||||
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
|
||||||
for img in imgs:
|
|
||||||
src = img.get('src', '')
|
|
||||||
if base64_pattern.match(src):
|
|
||||||
# Replace base64 data with empty string
|
|
||||||
img['src'] = base64_pattern.sub('', src)
|
|
||||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||||
cleaned_html = sanitize_html(cleaned_html)
|
cleaned_html = sanitize_html(cleaned_html)
|
||||||
|
|
||||||
@@ -721,7 +716,7 @@ def extract_xml_data(tags, string):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
# Function to perform the completion with exponential backoff
|
# Function to perform the completion with exponential backoff
|
||||||
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None):
|
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False):
|
||||||
from litellm import completion
|
from litellm import completion
|
||||||
from litellm.exceptions import RateLimitError
|
from litellm.exceptions import RateLimitError
|
||||||
max_attempts = 3
|
max_attempts = 3
|
||||||
@@ -740,7 +735,6 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
|
|||||||
],
|
],
|
||||||
temperature=0.01,
|
temperature=0.01,
|
||||||
api_key=api_token,
|
api_key=api_token,
|
||||||
base_url=base_url,
|
|
||||||
**extra_args
|
**extra_args
|
||||||
)
|
)
|
||||||
return response # Return the successful response
|
return response # Return the successful response
|
||||||
@@ -761,7 +755,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
|
|||||||
"content": ["Rate limit error. Please try again later."]
|
"content": ["Rate limit error. Please try again later."]
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
|
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
||||||
# api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
# api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
||||||
api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
|
api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
|
||||||
|
|
||||||
@@ -776,7 +770,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, bas
|
|||||||
"{" + variable + "}", variable_values[variable]
|
"{" + variable + "}", variable_values[variable]
|
||||||
)
|
)
|
||||||
|
|
||||||
response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url)
|
response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
|
||||||
@@ -840,7 +834,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
|
|||||||
|
|
||||||
return sum(all_blocks, [])
|
return sum(all_blocks, [])
|
||||||
|
|
||||||
|
|
||||||
def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
||||||
"""
|
"""
|
||||||
Merges small chunks into larger ones based on the total token threshold.
|
Merges small chunks into larger ones based on the total token threshold.
|
||||||
@@ -870,23 +863,22 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
|||||||
|
|
||||||
return merged_sections
|
return merged_sections
|
||||||
|
|
||||||
def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
|
def process_sections(url: str, sections: list, provider: str, api_token: str) -> list:
|
||||||
extracted_content = []
|
extracted_content = []
|
||||||
if provider.startswith("groq/"):
|
if provider.startswith("groq/"):
|
||||||
# Sequential processing with a delay
|
# Sequential processing with a delay
|
||||||
for section in sections:
|
for section in sections:
|
||||||
extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url))
|
extracted_content.extend(extract_blocks(url, section, provider, api_token))
|
||||||
time.sleep(0.5) # 500 ms delay between each processing
|
time.sleep(0.5) # 500 ms delay between each processing
|
||||||
else:
|
else:
|
||||||
# Parallel processing using ThreadPoolExecutor
|
# Parallel processing using ThreadPoolExecutor
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections]
|
futures = [executor.submit(extract_blocks, url, section, provider, api_token) for section in sections]
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
extracted_content.extend(future.result())
|
extracted_content.extend(future.result())
|
||||||
|
|
||||||
return extracted_content
|
return extracted_content
|
||||||
|
|
||||||
|
|
||||||
def wrap_text(draw, text, font, max_width):
|
def wrap_text(draw, text, font, max_width):
|
||||||
# Wrap the text to fit within the specified width
|
# Wrap the text to fit within the specified width
|
||||||
lines = []
|
lines = []
|
||||||
@@ -898,7 +890,6 @@ def wrap_text(draw, text, font, max_width):
|
|||||||
lines.append(line)
|
lines.append(line)
|
||||||
return '\n'.join(lines)
|
return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
def format_html(html_string):
|
def format_html(html_string):
|
||||||
soup = BeautifulSoup(html_string, 'html.parser')
|
soup = BeautifulSoup(html_string, 'html.parser')
|
||||||
return soup.prettify()
|
return soup.prettify()
|
||||||
|
|||||||
@@ -22,10 +22,9 @@ class WebCrawler:
|
|||||||
crawler_strategy: CrawlerStrategy = None,
|
crawler_strategy: CrawlerStrategy = None,
|
||||||
always_by_pass_cache: bool = False,
|
always_by_pass_cache: bool = False,
|
||||||
verbose: bool = False,
|
verbose: bool = False,
|
||||||
proxy: str = None,
|
|
||||||
):
|
):
|
||||||
# self.db_path = db_path
|
# self.db_path = db_path
|
||||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose, proxy=proxy)
|
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
|
||||||
self.always_by_pass_cache = always_by_pass_cache
|
self.always_by_pass_cache = always_by_pass_cache
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -19,7 +19,7 @@ with open("requirements.txt") as f:
|
|||||||
requirements = f.read().splitlines()
|
requirements = f.read().splitlines()
|
||||||
|
|
||||||
# Define the requirements for different environments
|
# Define the requirements for different environments
|
||||||
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "numpy"))]
|
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn"))]
|
||||||
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
|
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
|
||||||
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]
|
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user