- User agent

- Extract Links
- Extract Metadata
- Update Readme
- Update REST API document
unclecode
2024-06-08 17:59:42 +08:00
parent 9c34b30723
commit b3a0edaa6d
12 changed files with 155 additions and 75 deletions

View File

@@ -1 +1,4 @@
# Changelog
## TODO:
- User agent: "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",

View File

@@ -14,6 +14,9 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
### v0.2.3
- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
- 🔗 Extract all external and internal links. Check `result.links`
- 📚 Extract metadata from the page. Check `result.metadata`
- 🕵️ Support `user_agent` parameter to set the user agent for the HTTP requests.
- 🖼️ Take [screenshots](#taking-screenshots) of the page.
### v0.2.2
@@ -32,7 +35,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
## Power and Simplicity of Crawl4AI 🚀
The easiest way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server; I may improve its capacity if I see demand. You can find all the REST API examples in this Colab notebook. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
```python
import requests
@@ -41,7 +44,6 @@ data = {
"urls": [ "urls": [
"https://www.nbcnews.com/business" "https://www.nbcnews.com/business"
], ],
"word_count_threshold": 5,
"screenshot": True "screenshot": True
} }
@@ -242,8 +244,12 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t
"url": "https://www.nbcnews.com/business", "url": "https://www.nbcnews.com/business",
"extracted_content": "...", "extracted_content": "...",
"html": "...", "html": "...",
"cleaned_html": "...",
"markdown": "...", "markdown": "...",
"metadata": {...} "media": {...},
"links": {...},
"metadata": {...},
"screenshots": "...",
} }
] ]
} }
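To illustrate the expanded response, here is a minimal sketch of reading the new `media`, `links`, and `metadata` keys back from the local endpoint (assuming the server above is running):

```python
import requests

data = {"urls": ["https://www.nbcnews.com/business"], "screenshot": True}
response = requests.post("http://localhost:8000/crawl", json=data)
result = response.json()["results"][0]

print(result["metadata"].get("title"))   # page metadata is now part of the response
print(len(result["links"]["internal"]))  # internal links found on the page
print(len(result["media"]["images"]))    # image tags found on the page
```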
@@ -282,6 +288,24 @@ Crawl result without raw HTML content:
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
```
### Result Structure
The result object contains the following fields:
```python
class CrawlResult(BaseModel):
    url: str
    html: str
    success: bool
    cleaned_html: Optional[str] = None
    media: Dict[str, List[Dict]] = {}  # Media tags in the page {"images": [], "audio": [], "video": []}
    links: Dict[str, List[Dict]] = {}  # Links in the page {"external": [], "internal": []}
    screenshot: Optional[str] = None  # Base64 encoded screenshot
    markdown: Optional[str] = None
    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
```
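For example, after a successful `crawler.run(...)` call the new fields can be read straight off the result object (a small sketch; `metadata` may be `None` if nothing was extracted):

```python
result = crawler.run(url="https://www.nbcnews.com/business")
if result.success:
    print((result.metadata or {}).get("title"))  # e.g. the page <title>
    print(result.links["external"][:3])          # first few external links
    print(result.media["images"][:3])            # first few image entries
```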
### Taking Screenshots
```python
@@ -401,6 +425,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
| `user_agent` | The user agent to use for the HTTP requests. | No | `Mozilla/5.0` |
| `verbose` | Whether to enable verbose logging. | No | `true` |
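For instance, a custom user agent can be passed directly to `run` (a minimal sketch; the import path is assumed from this repository's layout):

```python
from crawl4ai.web_crawler import WebCrawler

crawler = WebCrawler()
result = crawler.run(
    url="https://www.nbcnews.com/business",
    user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",
)
```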
## Chunking Strategies 📚

View File

@@ -45,6 +45,10 @@ class CrawlerStrategy(ABC):
    def take_screenshot(self, save_path: str):
        pass

    @abstractmethod
    def update_user_agent(self, user_agent: str):
        pass

class CloudCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html = False):
        super().__init__()
@@ -69,6 +73,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy") print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
self.options = Options() self.options = Options()
self.options.headless = True self.options.headless = True
if kwargs.get("user_agent"):
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
self.options.add_argument("--no-sandbox") self.options.add_argument("--no-sandbox")
self.options.add_argument("--headless") self.options.add_argument("--headless")
# self.options.add_argument("--disable-dev-shm-usage") # self.options.add_argument("--disable-dev-shm-usage")
@@ -97,6 +103,11 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        self.service.log_path = "NUL"
        self.driver = webdriver.Chrome(service=self.service, options=self.options)

    def update_user_agent(self, user_agent: str):
        self.options.add_argument(f"user-agent={user_agent}")
        self.driver.quit()
        self.driver = webdriver.Chrome(service=self.service, options=self.options)

    def crawl(self, url: str) -> str:
        if self.use_cached_html:
            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
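For reference, a small sketch of exercising the new hook at the strategy level (module path assumed from this repository's layout; requires a local chromedriver):

```python
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

# Set the agent at construction time via the new kwargs check...
strategy = LocalSeleniumCrawlerStrategy(user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36")

# ...or swap it later; note that update_user_agent() quits and recreates the Chrome driver.
strategy.update_user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36")
html = strategy.crawl("https://www.nbcnews.com/business")
```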

View File

@@ -21,6 +21,7 @@ def init_db():
            success BOOLEAN,
            media TEXT DEFAULT "{}",
            link TEXT DEFAULT "{}",
            metadata TEXT DEFAULT "{}",
            screenshot TEXT DEFAULT ""
        )
    ''')
@@ -42,12 +43,12 @@ def check_db_path():
    if not DB_PATH:
        raise ValueError("Database path is not set or is empty.")

def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
@@ -55,14 +56,14 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, boo
print(f"Error retrieving cached URL: {e}") print(f"Error retrieving cached URL: {e}")
return None return None
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", screenshot: str = ""): def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
check_db_path() check_db_path()
try: try:
conn = sqlite3.connect(DB_PATH) conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute(''' cursor.execute('''
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot) INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET ON CONFLICT(url) DO UPDATE SET
html = excluded.html, html = excluded.html,
cleaned_html = excluded.cleaned_html, cleaned_html = excluded.cleaned_html,
@@ -71,8 +72,9 @@ def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_c
                success = excluded.success,
                media = excluded.media,
                links = excluded.links,
                metadata = excluded.metadata,
                screenshot = excluded.screenshot
        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
        conn.commit()
        conn.close()
    except Exception as e:
@@ -126,5 +128,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
if __name__ == "__main__": if __name__ == "__main__":
init_db() # Initialize the database if not already initialized init_db() # Initialize the database if not already initialized
alter_db_add_screenshot("links") # Add the new column to the table alter_db_add_screenshot("metadata") # Add the new column to the table
update_existing_records("links") # Update existing records to set the new column to an empty string update_existing_records("metadata") # Update existing records to set the new column to an empty string
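As a sanity check on the new column, a small sketch of the JSON round trip the cache now performs for metadata (mirroring `cache_url` above and the `cached[8]` read in `WebCrawler`):

```python
import json

metadata = {"title": "NBC News", "og:type": "website"}

stored = json.dumps(metadata)          # what the crawler passes into cache_url() for the new column
restored = json.loads(stored or "{}")  # what WebCrawler loads back from the cached row
assert restored == metadata
```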

View File

@@ -359,6 +359,47 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
        print('Error processing HTML content:', str(e))
        raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
def extract_metadata(html):
    metadata = {}
    if not html:
        return metadata

    # Parse HTML content with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Title
    title_tag = soup.find('title')
    metadata['title'] = title_tag.string if title_tag else None

    # Meta description
    description_tag = soup.find('meta', attrs={'name': 'description'})
    metadata['description'] = description_tag['content'] if description_tag else None

    # Meta keywords
    keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
    metadata['keywords'] = keywords_tag['content'] if keywords_tag else None

    # Meta author
    author_tag = soup.find('meta', attrs={'name': 'author'})
    metadata['author'] = author_tag['content'] if author_tag else None

    # Open Graph metadata
    og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
    for tag in og_tags:
        property_name = tag['property']
        metadata[property_name] = tag['content']

    # Twitter Card metadata
    twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
    for tag in twitter_tags:
        property_name = tag['name']
        metadata[property_name] = tag['content']

    return metadata
def extract_xml_tags(string):
    tags = re.findall(r'<(\w+)>', string)
    return list(set(tags))
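A quick sketch of what the new `extract_metadata` helper returns for a small page (the import path is assumed from this repository's layout):

```python
from crawl4ai.utils import extract_metadata

html = """<html><head>
<title>Rent prices climb again</title>
<meta name="description" content="Inflation keeps pushing rents higher.">
<meta property="og:title" content="Rent prices climb again">
<meta name="twitter:card" content="summary">
</head><body></body></html>"""

print(extract_metadata(html))
# Expected shape, given the implementation above:
# {'title': 'Rent prices climb again',
#  'description': 'Inflation keeps pushing rents higher.',
#  'keywords': None, 'author': None,
#  'og:title': 'Rent prices climb again',
#  'twitter:card': 'summary'}
```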

View File

@@ -89,8 +89,11 @@ class WebCrawler:
        css_selector: str = None,
        screenshot: bool = False,
        verbose=True,
        user_agent: str = None,
        **kwargs,
    ) -> CrawlResult:
        if user_agent:
            self.crawler_strategy.update_user_agent(user_agent)
        extraction_strategy = extraction_strategy or NoExtractionStrategy()
        extraction_strategy.verbose = verbose
        # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
@@ -117,7 +120,8 @@ class WebCrawler:
"success": cached[5], "success": cached[5],
"media": json.loads(cached[6] or "{}"), "media": json.loads(cached[6] or "{}"),
"links": json.loads(cached[7] or "{}"), "links": json.loads(cached[7] or "{}"),
"screenshot": cached[8], "metadata": json.loads(cached[8] or "{}"), # "metadata": "{}
"screenshot": cached[9],
"error_message": "", "error_message": "",
} }
) )
@@ -135,6 +139,7 @@ class WebCrawler:
        # Extract content from HTML
        try:
            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
            metadata = extract_metadata(html)
            if result is None:
                raise ValueError(f"Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
@@ -180,6 +185,7 @@ class WebCrawler:
            success,
            json.dumps(media),
            json.dumps(links),
            json.dumps(metadata),
            screenshot=base64_image,
        )
@@ -190,6 +196,7 @@ class WebCrawler:
            markdown=markdown,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=base64_image,
            extracted_content=extracted_content,
            success=success,

[Binary image files not shown: one image updated (344 KiB → 372 KiB) and three new images added (537 KiB, 419 KiB, 485 KiB).]

View File

@@ -1,75 +1,64 @@
import requests, base64, os
data = {
    "urls": ["https://www.nbcnews.com/business"],
    "screenshot": True,
}
response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()['results'][0]
print(result.keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])

with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(result['screenshot']))

# Example of filtering the content using CSS selectors
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "css_selector": "article",
    "screenshot": True,
}
# Example of executing a JS script on the page before extracting the content
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "screenshot": True,
    'js' : ["""
        const loadMoreButton = Array.from(document.querySelectorAll('button')).
            find(button => button.textContent.includes('Load More'));
        loadMoreButton && loadMoreButton.click();
    """]
}
# Example of using a custom extraction strategy
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "extraction_strategy": "CosineStrategy",
    "extraction_strategy_args": {
        "semantic_filter": "inflation rent prices"
    },
}
# Example of using LLM to extract content
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "extraction_strategy": "LLMExtractionStrategy",
    "extraction_strategy_args": {
        "provider": "groq/llama3-8b-8192",
        "api_token": os.environ.get("GROQ_API_KEY"),
        "instruction": """I am interested in only financial news,
        and translate them in French."""
    },
}
response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()['results'][0]
print(result['markdown'])
print(result['cleaned_html'])
print(result['media'])
print(result['extracted_content'])
with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(result['screenshot']))

View File

@@ -57,6 +57,7 @@ class CrawlRequest(BaseModel):
    chunking_strategy_args: Optional[dict] = {}
    css_selector: Optional[str] = None
    screenshot: Optional[bool] = False
    user_agent: Optional[str] = None
    verbose: Optional[bool] = True
@@ -127,6 +128,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
            crawl_request.bypass_cache,
            crawl_request.css_selector,
            crawl_request.screenshot,
            crawl_request.user_agent,
            crawl_request.verbose
        )
        for url in crawl_request.urls
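With the new `user_agent` field on `CrawlRequest`, the REST body can carry a custom agent as well; a minimal sketch against the local server:

```python
import requests

data = {
    "urls": ["https://www.nbcnews.com/business"],
    "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",
    "screenshot": False,
}
response = requests.post("http://localhost:8000/crawl", json=data)
print(response.json()["results"][0]["metadata"])
```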