Compare commits
6 Commits
v0.2.74
...
main-img-c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8463aabedf | ||
|
|
7f30144ef2 | ||
|
|
fa5516aad6 | ||
|
|
ca0336af9e | ||
|
|
65ed1aeade | ||
|
|
4d283ab386 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -165,6 +165,8 @@ Crawl4AI.egg-info/
|
|||||||
Crawl4AI.egg-info/*
|
Crawl4AI.egg-info/*
|
||||||
crawler_data.db
|
crawler_data.db
|
||||||
.vscode/
|
.vscode/
|
||||||
|
.tests/
|
||||||
|
.test_pads/
|
||||||
test_pad.py
|
test_pad.py
|
||||||
test_pad*.py
|
test_pad*.py
|
||||||
.data/
|
.data/
|
||||||
|
|||||||
@@ -1,5 +1,14 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.2.74] - 2024-07-08
|
||||||
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
- 💻 **UTF-8 encoding fix**: Resolved the Windows "charmap" error by explicitly using UTF-8 encoding.
|
||||||
|
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
|
||||||
|
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
|
||||||
|
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
|
||||||
|
|
||||||
|
|
||||||
## [v0.2.73] - 2024-07-03
|
## [v0.2.73] - 2024-07-03
|
||||||
|
|
||||||
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.73 🕷️🤖
|
# Crawl4AI v0.2.74 🕷️🤖
|
||||||
|
|
||||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import re
|
|||||||
from collections import Counter
|
from collections import Counter
|
||||||
import string
|
import string
|
||||||
from .model_loader import load_nltk_punkt
|
from .model_loader import load_nltk_punkt
|
||||||
|
from .utils import *
|
||||||
|
|
||||||
# Define the abstract base class for chunking strategies
|
# Define the abstract base class for chunking strategies
|
||||||
class ChunkingStrategy(ABC):
|
class ChunkingStrategy(ABC):
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from selenium.webdriver.chrome.options import Options
|
|||||||
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
from urllib3.exceptions import MaxRetryError
|
||||||
|
|
||||||
from .config import *
|
from .config import *
|
||||||
import logging, time
|
import logging, time
|
||||||
@@ -18,7 +19,7 @@ from typing import List, Callable
|
|||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from .utils import wrap_text
|
from .utils import *
|
||||||
|
|
||||||
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
||||||
logger.setLevel(logging.WARNING)
|
logger.setLevel(logging.WARNING)
|
||||||
@@ -73,7 +74,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
|
|||||||
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
|
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
|
||||||
response = response.json()
|
response = response.json()
|
||||||
html = response["results"][0]["html"]
|
html = response["results"][0]["html"]
|
||||||
return html
|
return sanitize_input_encode(html)
|
||||||
|
|
||||||
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||||
@@ -200,7 +201,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||||
if os.path.exists(cache_file_path):
|
if os.path.exists(cache_file_path):
|
||||||
with open(cache_file_path, "r") as f:
|
with open(cache_file_path, "r") as f:
|
||||||
return f.read()
|
return sanitize_input_encode(f.read())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.driver = self.execute_hook('before_get_url', self.driver)
|
self.driver = self.execute_hook('before_get_url', self.driver)
|
||||||
@@ -214,11 +215,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
WebDriverWait(self.driver, 10).until(
|
WebDriverWait(self.driver, 10).until(
|
||||||
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
|
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
|
||||||
)
|
)
|
||||||
|
|
||||||
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
||||||
html = self._ensure_page_load() # self.driver.page_source
|
|
||||||
|
self.driver = self.execute_hook('after_get_url', self.driver)
|
||||||
|
html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source
|
||||||
can_not_be_done_headless = False # Look at my creativity for naming variables
|
can_not_be_done_headless = False # Look at my creativity for naming variables
|
||||||
# TODO: Very ugly way for now but it works
|
|
||||||
if not kwargs.get('bypass_headless', False) and html == "<html><head></head><body></body></html>":
|
# TODO: Very ugly approach, but promise to change it!
|
||||||
|
if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
|
||||||
print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
|
print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
|
||||||
can_not_be_done_headless = True
|
can_not_be_done_headless = True
|
||||||
options = Options()
|
options = Options()
|
||||||
@@ -227,11 +232,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
options.add_argument("--window-size=5,5")
|
options.add_argument("--window-size=5,5")
|
||||||
driver = webdriver.Chrome(service=self.service, options=options)
|
driver = webdriver.Chrome(service=self.service, options=options)
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
html = driver.page_source
|
self.driver = self.execute_hook('after_get_url', driver)
|
||||||
|
html = sanitize_input_encode(driver.page_source)
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
self.driver = self.execute_hook('after_get_url', self.driver)
|
|
||||||
|
|
||||||
# Execute JS code if provided
|
# Execute JS code if provided
|
||||||
if self.js_code and type(self.js_code) == str:
|
if self.js_code and type(self.js_code) == str:
|
||||||
self.driver.execute_script(self.js_code)
|
self.driver.execute_script(self.js_code)
|
||||||
@@ -247,12 +251,12 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if not can_not_be_done_headless:
|
if not can_not_be_done_headless:
|
||||||
html = self.driver.page_source
|
html = sanitize_input_encode(self.driver.page_source)
|
||||||
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
||||||
|
|
||||||
# Store in cache
|
# Store in cache
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||||
with open(cache_file_path, "w") as f:
|
with open(cache_file_path, "w", encoding="utf-8") as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
@@ -261,16 +265,16 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
return html
|
return html
|
||||||
except InvalidArgumentException:
|
except InvalidArgumentException:
|
||||||
if not hasattr(e, 'msg'):
|
if not hasattr(e, 'msg'):
|
||||||
e.msg = str(e)
|
e.msg = sanitize_input_encode(str(e))
|
||||||
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||||
except WebDriverException as e:
|
except WebDriverException as e:
|
||||||
# If e does not have a msg attribute, create it and set it to str(e)
|
# If e does not have a msg attribute, create it and set it to str(e)
|
||||||
if not hasattr(e, 'msg'):
|
if not hasattr(e, 'msg'):
|
||||||
e.msg = str(e)
|
e.msg = sanitize_input_encode(str(e))
|
||||||
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if not hasattr(e, 'msg'):
|
if not hasattr(e, 'msg'):
|
||||||
e.msg = str(e)
|
e.msg = sanitize_input_encode(str(e))
|
||||||
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
||||||
|
|
||||||
def take_screenshot(self) -> str:
|
def take_screenshot(self) -> str:
|
||||||
@@ -299,7 +303,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
return img_base64
|
return img_base64
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = f"Failed to take screenshot: {str(e)}"
|
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
||||||
print(error_message)
|
print(error_message)
|
||||||
|
|
||||||
# Generate an image with black background
|
# Generate an image with black background
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def init_db():
|
|||||||
extracted_content TEXT,
|
extracted_content TEXT,
|
||||||
success BOOLEAN,
|
success BOOLEAN,
|
||||||
media TEXT DEFAULT "{}",
|
media TEXT DEFAULT "{}",
|
||||||
link TEXT DEFAULT "{}",
|
links TEXT DEFAULT "{}",
|
||||||
metadata TEXT DEFAULT "{}",
|
metadata TEXT DEFAULT "{}",
|
||||||
screenshot TEXT DEFAULT ""
|
screenshot TEXT DEFAULT ""
|
||||||
)
|
)
|
||||||
@@ -127,6 +127,9 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
|
|||||||
print(f"Error updating existing records: {e}")
|
print(f"Error updating existing records: {e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
init_db() # Initialize the database if not already initialized
|
# Delete the existing database file
|
||||||
alter_db_add_screenshot("metadata") # Add the new column to the table
|
if os.path.exists(DB_PATH):
|
||||||
update_existing_records("metadata") # Update existing records to set the new column to an empty string
|
os.remove(DB_PATH)
|
||||||
|
init_db()
|
||||||
|
# alter_db_add_screenshot("COL_NAME")
|
||||||
|
|
||||||
|
|||||||
@@ -116,7 +116,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
for block in blocks:
|
for block in blocks:
|
||||||
block['error'] = False
|
block['error'] = False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error extracting blocks:", str(e))
|
|
||||||
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
||||||
blocks = parsed
|
blocks = parsed
|
||||||
if unparsed:
|
if unparsed:
|
||||||
@@ -192,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
# Sequential processing with a delay
|
# Sequential processing with a delay
|
||||||
for ix, section in enumerate(merged_sections):
|
for ix, section in enumerate(merged_sections):
|
||||||
extract_func = partial(self.extract, url)
|
extract_func = partial(self.extract, url)
|
||||||
extracted_content.extend(extract_func(ix, section))
|
extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
|
||||||
time.sleep(0.5) # 500 ms delay between each processing
|
time.sleep(0.5) # 500 ms delay between each processing
|
||||||
else:
|
else:
|
||||||
# Parallel processing using ThreadPoolExecutor
|
# Parallel processing using ThreadPoolExecutor
|
||||||
@@ -202,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=4) as executor:
|
with ThreadPoolExecutor(max_workers=4) as executor:
|
||||||
extract_func = partial(self.extract, url)
|
extract_func = partial(self.extract, url)
|
||||||
futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
|
futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
extracted_content.extend(future.result())
|
try:
|
||||||
|
extracted_content.extend(future.result())
|
||||||
|
except Exception as e:
|
||||||
|
if self.verbose:
|
||||||
|
print(f"Error in thread execution: {e}")
|
||||||
|
# Add error information to extracted_content
|
||||||
|
extracted_content.append({
|
||||||
|
"index": 0,
|
||||||
|
"error": True,
|
||||||
|
"tags": ["error"],
|
||||||
|
"content": str(e)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
return extracted_content
|
return extracted_content
|
||||||
|
|||||||
@@ -96,6 +96,16 @@ def sanitize_html(html):
|
|||||||
|
|
||||||
return sanitized_html
|
return sanitized_html
|
||||||
|
|
||||||
|
def sanitize_input_encode(text: str) -> str:
    """Return *text* with everything that cannot be represented as UTF-8 removed.

    The string is round-tripped through UTF-8 with ``errors='ignore'``, which
    silently drops unencodable code points such as lone surrogates.

    Args:
        text: The text to sanitize. ``None`` and raw bytes are tolerated so
            that callers can pass values straight from a cache or a database
            row without pre-checking them.

    Returns:
        The sanitized string. ``None`` yields ``""``; bytes are decoded as
        UTF-8 with undecodable bytes dropped.
    """
    # A missing value (e.g. a NULL column) is treated as empty text instead of
    # crashing on ``None.encode``.
    if text is None:
        return ""

    # Raw bytes have no ``encode``; decode them leniently instead.
    if isinstance(text, (bytes, bytearray)):
        return bytes(text).decode("utf-8", errors="ignore")

    # With errors='ignore' the encoder never raises UnicodeEncodeError, so no
    # ASCII fallback is needed: offending characters are simply dropped.
    return text.encode("utf-8", errors="ignore").decode("utf-8")
|
||||||
|
|
||||||
def escape_json_string(s):
|
def escape_json_string(s):
|
||||||
"""
|
"""
|
||||||
Escapes characters in a string to be JSON safe.
|
Escapes characters in a string to be JSON safe.
|
||||||
@@ -664,7 +674,6 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
|||||||
for block in blocks:
|
for block in blocks:
|
||||||
block['error'] = False
|
block['error'] = False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error extracting blocks:", str(e))
|
|
||||||
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
||||||
blocks = parsed
|
blocks = parsed
|
||||||
# Append all unparsed segments as one error block whose content is the list of unparsed segments
|
# Append all unparsed segments as one error block whose content is the list of unparsed segments
|
||||||
@@ -710,7 +719,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
|
|||||||
blocks = json.loads(blocks)
|
blocks = json.loads(blocks)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error extracting blocks:", str(e))
|
|
||||||
blocks = [{
|
blocks = [{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
"tags": ["error"],
|
"tags": ["error"],
|
||||||
|
|||||||
@@ -155,8 +155,8 @@ class WebCrawler:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
if cached:
|
if cached:
|
||||||
html = cached[1]
|
html = sanitize_input_encode(cached[1])
|
||||||
extracted_content = cached[4]
|
extracted_content = sanitize_input_encode(cached[4])
|
||||||
if screenshot:
|
if screenshot:
|
||||||
screenshot_data = cached[9]
|
screenshot_data = cached[9]
|
||||||
if not screenshot_data:
|
if not screenshot_data:
|
||||||
@@ -166,7 +166,7 @@ class WebCrawler:
|
|||||||
if user_agent:
|
if user_agent:
|
||||||
self.crawler_strategy.update_user_agent(user_agent)
|
self.crawler_strategy.update_user_agent(user_agent)
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
html = self.crawler_strategy.crawl(url, **kwargs)
|
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
|
||||||
t2 = time.time()
|
t2 = time.time()
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
||||||
@@ -213,8 +213,8 @@ class WebCrawler:
|
|||||||
except InvalidCSSSelectorError as e:
|
except InvalidCSSSelectorError as e:
|
||||||
raise ValueError(str(e))
|
raise ValueError(str(e))
|
||||||
|
|
||||||
cleaned_html = result.get("cleaned_html", "")
|
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||||
markdown = result.get("markdown", "")
|
markdown = sanitize_input_encode(result.get("markdown", ""))
|
||||||
media = result.get("media", [])
|
media = result.get("media", [])
|
||||||
links = result.get("links", [])
|
links = result.get("links", [])
|
||||||
metadata = result.get("metadata", {})
|
metadata = result.get("metadata", {})
|
||||||
|
|||||||
@@ -36,5 +36,5 @@ model_fees = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
@@ -249,15 +249,40 @@ def using_crawler_hooks(crawler):
|
|||||||
|
|
||||||
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||||
|
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||||
|
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||||
|
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
result = crawler.run(url="https://example.com")
|
result = crawler.run(url="https://example.com")
|
||||||
|
|
||||||
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||||
print_result(result= result)
|
print_result(result= result)
|
||||||
|
|
||||||
|
def using_crawler_hooks_dleay_example(crawler):
    """Demonstrate an ``after_get_url`` hook that pauses before the page is read.

    A dedicated crawler is built with the hook installed on its strategy, so
    the ``crawler`` argument is accepted for signature compatibility with the
    other examples but is replaced by the locally created instance.

    Args:
        crawler: Ignored; kept so callers can invoke every example uniformly.
    """
    def delay(driver):
        """Sleep for five seconds, then hand the driver back to the strategy."""
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")
        # The strategy assigns the hook's return value back to its driver, so
        # the hook must return the driver rather than None.
        return driver

    def create_crawler():
        """Build and warm up a crawler whose strategy runs ``delay`` after each fetch."""
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
        crawler_strategy.set_hook('after_get_url', delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
    crawler = create_crawler()
    # bypass_cache=True forces a real fetch so the hook actually runs.
    result = crawler.run(url="https://google.com", bypass_cache=True)

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||||
|
|||||||
@@ -42,5 +42,5 @@ page_summary = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(page_summary)
|
print(page_summary)
|
||||||
|
|
||||||
with open(".data/page_summary.json", "w") as f:
|
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
|
|||||||
@@ -1,5 +1,13 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.2.74] - 2024-07-08
|
||||||
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
- 💻 **UTF-8 encoding fix**: Resolved the Windows "charmap" error by explicitly using UTF-8 encoding.
|
||||||
|
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
|
||||||
|
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
|
||||||
|
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
|
||||||
|
|
||||||
## [v0.2.73] - 2024-07-03
|
## [v0.2.73] - 2024-07-03
|
||||||
|
|
||||||
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<button class="btn btn-default" type="submit">Submit</button>
|
<button class="btn btn-default" type="submit">Submit</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
</fieldset>
|
</fieldset>
|
||||||
</form>
|
</form>
|
||||||
|
|
||||||
@@ -93,6 +94,10 @@
|
|||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
<div id="error" class="error-message" style="display: none; margin-top:1em;">
|
||||||
|
<div class="terminal-alert terminal-alert-error"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
function showTab(tabId) {
|
function showTab(tabId) {
|
||||||
const tabs = document.querySelectorAll('.tab-content');
|
const tabs = document.querySelectorAll('.tab-content');
|
||||||
@@ -162,7 +167,17 @@
|
|||||||
},
|
},
|
||||||
body: JSON.stringify(data)
|
body: JSON.stringify(data)
|
||||||
})
|
})
|
||||||
.then(response => response.json())
|
.then(response => {
|
||||||
|
if (!response.ok) {
|
||||||
|
if (response.status === 429) {
|
||||||
|
return response.json().then(err => {
|
||||||
|
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
|
||||||
|
});
|
||||||
|
}
|
||||||
|
throw new Error('Network response was not ok');
|
||||||
|
}
|
||||||
|
return response.json();
|
||||||
|
})
|
||||||
.then(data => {
|
.then(data => {
|
||||||
data = data.results[0]; // Only one URL is requested
|
data = data.results[0]; // Only one URL is requested
|
||||||
document.getElementById('loading').style.display = 'none';
|
document.getElementById('loading').style.display = 'none';
|
||||||
@@ -187,11 +202,29 @@ result = crawler.run(
|
|||||||
print(result)
|
print(result)
|
||||||
`;
|
`;
|
||||||
redo(document.getElementById('pythonCode'), pythonCode);
|
redo(document.getElementById('pythonCode'), pythonCode);
|
||||||
|
document.getElementById('error').style.display = 'none';
|
||||||
})
|
})
|
||||||
.catch(error => {
|
.catch(error => {
|
||||||
document.getElementById('loading').style.display = 'none';
|
document.getElementById('loading').style.display = 'none';
|
||||||
document.getElementById('response').style.display = 'block';
|
document.getElementById('error').style.display = 'block';
|
||||||
document.getElementById('markdownContent').textContent = 'Error: ' + error;
|
let errorMessage = 'An unexpected error occurred. Please try again later.';
|
||||||
|
|
||||||
|
if (error.status === 429) {
|
||||||
|
const details = error.details;
|
||||||
|
if (details.retry_after) {
|
||||||
|
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
|
||||||
|
} else if (details.reset_at) {
|
||||||
|
const resetTime = new Date(details.reset_at);
|
||||||
|
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
|
||||||
|
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
|
||||||
|
} else {
|
||||||
|
errorMessage = `Rate limit exceeded. Please try again later.`;
|
||||||
|
}
|
||||||
|
} else if (error.message) {
|
||||||
|
errorMessage = error.message;
|
||||||
|
}
|
||||||
|
|
||||||
|
document.querySelector('#error .terminal-alert').textContent = errorMessage;
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
</script>
|
</script>
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks! In this example, we'll:
|
|||||||
### Hook Definitions
|
### Hook Definitions
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
|
from crawl4ai.crawler_strategy import *
|
||||||
|
|
||||||
def on_driver_created(driver):
|
def on_driver_created(driver):
|
||||||
print("[HOOK] on_driver_created")
|
print("[HOOK] on_driver_created")
|
||||||
# Example customization: maximize the window
|
# Example customization: maximize the window
|
||||||
@@ -66,12 +69,13 @@ def before_return_html(driver, html):
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||||
crawler = WebCrawler(verbose=True)
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||||
|
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||||
|
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||||
|
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
crawler.warmup()
|
crawler.warmup()
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
|
||||||
|
|
||||||
result = crawler.run(url="https://example.com")
|
result = crawler.run(url="https://example.com")
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ model_fees = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -71,7 +71,7 @@ model_fees = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -91,7 +91,7 @@ This example demonstrates how to use `Crawl4AI` to extract a summary from a web
|
|||||||
Save the extracted data to a file for further use.
|
Save the extracted data to a file for further use.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
with open(".data/page_summary.json", "w") as f:
|
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.73
|
# Crawl4AI v0.2.74
|
||||||
|
|
||||||
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
||||||
|
|
||||||
|
|||||||
@@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}")
|
|||||||
Let's see how we can customize the crawler using hooks!
|
Let's see how we can customize the crawler using hooks!
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def on_driver_created(driver):
|
import time
|
||||||
print("[HOOK] on_driver_created")
|
|
||||||
driver.maximize_window()
|
|
||||||
driver.get('https://example.com/login')
|
|
||||||
driver.find_element(By.NAME, 'username').send_keys('testuser')
|
|
||||||
driver.find_element(By.NAME, 'password').send_keys('password123')
|
|
||||||
driver.find_element(By.NAME, 'login').click()
|
|
||||||
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
|
|
||||||
return driver
|
|
||||||
|
|
||||||
def before_get_url(driver):
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
print("[HOOK] before_get_url")
|
from crawl4ai.crawler_strategy import *
|
||||||
driver.execute_cdp_cmd('Network.enable', {})
|
|
||||||
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
|
|
||||||
return driver
|
|
||||||
|
|
||||||
def after_get_url(driver):
|
def delay(driver):
|
||||||
print("[HOOK] after_get_url")
|
print("Delaying for 5 seconds...")
|
||||||
print(driver.current_url)
|
time.sleep(5)
|
||||||
return driver
|
print("Resuming...")
|
||||||
|
|
||||||
|
def create_crawler():
|
||||||
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('after_get_url', delay)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
|
return crawler
|
||||||
|
|
||||||
def before_return_html(driver, html):
|
crawler = create_crawler()
|
||||||
print("[HOOK] before_return_html")
|
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
|
||||||
print(len(html))
|
|
||||||
return driver
|
|
||||||
|
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
|
||||||
|
|
||||||
result = crawler.run(url="https://example.com")
|
|
||||||
print(f"Crawler Hooks result: {result}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
See [Hooks](examples/hooks_auth.md) for more examples.
|
||||||
|
|
||||||
## Congratulations! 🎉
|
## Congratulations! 🎉
|
||||||
|
|
||||||
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
|
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
|
||||||
|
|||||||
83
main.py
83
main.py
@@ -22,6 +22,15 @@ from typing import List, Optional
|
|||||||
from crawl4ai.web_crawler import WebCrawler
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
from crawl4ai.database import get_total_count, clear_db
|
from crawl4ai.database import get_total_count, clear_db
|
||||||
|
|
||||||
|
import time
|
||||||
|
from slowapi import Limiter, _rate_limit_exceeded_handler
|
||||||
|
from slowapi.util import get_remote_address
|
||||||
|
from slowapi.errors import RateLimitExceeded
|
||||||
|
|
||||||
|
# load .env file
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
MAX_CONCURRENT_REQUESTS = 10 # Adjust this to change the maximum concurrent requests
|
MAX_CONCURRENT_REQUESTS = 10 # Adjust this to change the maximum concurrent requests
|
||||||
@@ -30,6 +39,78 @@ lock = asyncio.Lock()
|
|||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
|
# Initialize rate limiter
|
||||||
|
def rate_limit_key_func(request: Request):
    """Return the rate-limit bucket key for a request.

    Requests that present the configured access token get ``None`` as their
    key; every other request is keyed by its remote address.

    The bypass is only honoured when ``ACCESS_TOKEN`` is actually configured
    (non-empty) AND the request carries an ``access-token`` header. A plain
    ``==`` between the header and the env var would treat "header missing"
    and "token not configured" as equal (``None == None``) and let every
    anonymous caller through.

    Args:
        request: The incoming request; only its headers are inspected here.

    Returns:
        ``None`` for a request with a valid access token, otherwise the value
        of ``get_remote_address(request)``.
    """
    import hmac  # local import: constant-time comparison for the secret token

    expected_token = os.environ.get('ACCESS_TOKEN')
    supplied_token = request.headers.get("access-token")

    if expected_token and supplied_token is not None:
        # Compare as bytes: compare_digest rejects non-ASCII str arguments.
        if hmac.compare_digest(
            supplied_token.encode("utf-8"), expected_token.encode("utf-8")
        ):
            # NOTE(review): None is presumably meant to exempt the caller from
            # limiting — confirm how the limiter treats a None key.
            return None

    return get_remote_address(request)
|
||||||
|
|
||||||
|
limiter = Limiter(key_func=rate_limit_key_func)
|
||||||
|
app.state.limiter = limiter
|
||||||
|
|
||||||
|
# Dictionary to store last request times for each client
|
||||||
|
last_request_times = {}
|
||||||
|
last_rate_limit = {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_rate_limit():
    """Return the per-minute rate limit as a "<count>/minute" string.

    The count is read from the ACCESS_PER_MIN environment variable and
    falls back to "5" when the variable is not set.
    """
    per_minute = os.environ.get('ACCESS_PER_MIN', "5")
    return "{}/minute".format(per_minute)
|
||||||
|
|
||||||
|
# Custom rate limit exceeded handler
|
||||||
|
async def custom_rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse:
    """Build the HTTP 429 JSON response for a RateLimitExceeded error.

    A 60-second window is tracked per client host in ``last_rate_limit``: the
    window starts on the first rejected request and restarts once more than
    60 seconds have passed since it began.

    Args:
        request: The request that was rejected.
        exc: The raised exception; ``exc.limit.limit`` is echoed to the client.

    Returns:
        A JSONResponse with status 429 whose body carries ``detail``,
        ``limit``, ``retry_after`` (seconds left in the window), ``reset_at``
        (epoch seconds when the window ends) and ``message``.
    """
    # request.client is optional; fall back to a shared bucket rather than
    # raising AttributeError while already handling an error.
    client = request.client
    host = client.host if client else "unknown"

    # Sample the clock once so retry_after and reset_at agree with each other.
    now = time.time()
    window_start = last_rate_limit.get(host)
    if window_start is None or now - window_start > 60:
        window_start = now
        last_rate_limit[host] = now

    retry_after = 60 - (now - window_start)
    reset_at = now + retry_after

    return JSONResponse(
        status_code=429,
        content={
            "detail": "Rate limit exceeded",
            "limit": str(exc.limit.limit),
            "retry_after": retry_after,
            'reset_at': reset_at,
            "message": f"You have exceeded the rate limit of {exc.limit.limit}."
        }
    )
|
||||||
|
|
||||||
|
app.add_exception_handler(RateLimitExceeded, custom_rate_limit_exceeded_handler)
|
||||||
|
|
||||||
|
|
||||||
|
# Middleware for token-based bypass and per-request limit
|
||||||
|
class RateLimitMiddleware(BaseHTTPMiddleware):
    """Enforce a minimum interval between requests from the same client.

    Applies only to the "/crawl" and "/old" paths. The interval in seconds is
    read from the ACCESS_TIME_SPAN environment variable (default 10) on every
    request. Requests carrying the configured access token skip the check.
    Last-request timestamps are kept in the module-level
    ``last_request_times`` dict, keyed by client host.
    """

    @staticmethod
    def _has_valid_token(request: Request) -> bool:
        """Return True only if ACCESS_TOKEN is set and the header matches it."""
        import hmac  # local import: constant-time comparison for the secret token

        expected_token = os.environ.get('ACCESS_TOKEN')
        supplied_token = request.headers.get("access-token")
        # An unset/empty token or a missing header must never grant the
        # bypass; a plain `==` would accept None == None.
        if not expected_token or supplied_token is None:
            return False
        # Compare as bytes: compare_digest rejects non-ASCII str arguments.
        return hmac.compare_digest(
            supplied_token.encode("utf-8"), expected_token.encode("utf-8")
        )

    async def dispatch(self, request: Request, call_next):
        """Reject the request with 429 if the client called again too soon.

        Args:
            request: The incoming request.
            call_next: Callable that forwards the request down the stack.

        Returns:
            The downstream response, or a 429 JSONResponse when fewer than
            SPAN seconds have passed since this client's last accepted
            request to a limited path.
        """
        SPAN = int(os.environ.get('ACCESS_TIME_SPAN', 10))

        if self._has_valid_token(request):
            return await call_next(request)

        path = request.url.path
        if path in ["/crawl", "/old"]:
            # request.client is optional; use a shared bucket instead of
            # raising AttributeError when it is absent.
            client = request.client
            client_ip = client.host if client else "unknown"
            current_time = time.time()

            # Check time since last request
            if client_ip in last_request_times:
                time_since_last_request = current_time - last_request_times[client_ip]
                if time_since_last_request < SPAN:
                    remaining = max(0, SPAN - time_since_last_request)
                    return JSONResponse(
                        status_code=429,
                        content={
                            "detail": "Too many requests",
                            # Report the configured window, not a fixed 10.
                            "message": f"Rate limit exceeded. Please wait {SPAN} seconds between requests.",
                            "retry_after": remaining,
                            "reset_at": current_time + remaining,
                        }
                    )

            # Only accepted requests move the window; rejected ones return above.
            last_request_times[client_ip] = current_time

        return await call_next(request)
|
||||||
|
|
||||||
|
app.add_middleware(RateLimitMiddleware)
|
||||||
|
|
||||||
# CORS configuration
|
# CORS configuration
|
||||||
origins = ["*"] # Allow all origins
|
origins = ["*"] # Allow all origins
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
@@ -73,6 +154,7 @@ def read_root():
|
|||||||
return RedirectResponse(url="/mkdocs")
|
return RedirectResponse(url="/mkdocs")
|
||||||
|
|
||||||
@app.get("/old", response_class=HTMLResponse)
|
@app.get("/old", response_class=HTMLResponse)
|
||||||
|
@limiter.limit(get_rate_limit())
|
||||||
async def read_index(request: Request):
|
async def read_index(request: Request):
|
||||||
partials_dir = os.path.join(__location__, "pages", "partial")
|
partials_dir = os.path.join(__location__, "pages", "partial")
|
||||||
partials = {}
|
partials = {}
|
||||||
@@ -107,6 +189,7 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
|||||||
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
||||||
|
|
||||||
@app.post("/crawl")
|
@app.post("/crawl")
|
||||||
|
@limiter.limit(get_rate_limit())
|
||||||
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
||||||
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
||||||
global current_requests
|
global current_requests
|
||||||
|
|||||||
0
middlewares.py
Normal file
0
middlewares.py
Normal file
17
setup.py
17
setup.py
@@ -1,13 +1,18 @@
|
|||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import subprocess
|
import shutil
|
||||||
from setuptools.command.install import install
|
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||||
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
# If the folder already exists, remove the cache folder
|
||||||
os.makedirs(crawl4ai_folder, exist_ok=True)
|
crawl4ai_folder = Path.home() / ".crawl4ai"
|
||||||
os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
|
cache_folder = crawl4ai_folder / "cache"
|
||||||
|
|
||||||
|
if cache_folder.exists():
|
||||||
|
shutil.rmtree(cache_folder)
|
||||||
|
|
||||||
|
crawl4ai_folder.mkdir(exist_ok=True)
|
||||||
|
cache_folder.mkdir(exist_ok=True)
|
||||||
|
|
||||||
# Read the requirements from requirements.txt
|
# Read the requirements from requirements.txt
|
||||||
with open("requirements.txt") as f:
|
with open("requirements.txt") as f:
|
||||||
@@ -20,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="Crawl4AI",
|
name="Crawl4AI",
|
||||||
version="0.2.73",
|
version="0.2.74",
|
||||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||||
long_description=open("README.md", encoding="utf-8").read(),
|
long_description=open("README.md", encoding="utf-8").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
|
|||||||
Reference in New Issue
Block a user