diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5ddad421..d4504e40 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1 +1,4 @@
-# Changelog
\ No newline at end of file
+# Changelog
+
+## TODO:
+- User agent: "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",
\ No newline at end of file
diff --git a/README.md b/README.md
index 079382e2..10d81ed9 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,9 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 
 ### v0.2.3
 - 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
+- 🔗 Extract all external and internal links. Check `result.links`
+- 📚 Extract metadata from the page. Check `result.metadata`
+- 🕵️ Support `user_agent` parameter to set the user agent for the HTTP requests.
 - 🖼️ Take [screenshots](#taking-screenshots) of the page.
 
 ### v0.2.2
@@ -32,7 +35,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 
 ## Power and Simplicity of Crawl4AI 🚀
 
-The most easy way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server. I may improve its capacity if I see there is demand.
+The easiest way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server. I may improve its capacity if there is demand. You can find all the REST API examples in this Colab notebook. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
 
 ```python
 import requests
@@ -41,7 +44,6 @@
 data = {
     "urls": [
         "https://www.nbcnews.com/business"
     ],
-    "word_count_threshold": 5,
     "screenshot": True
 }
@@ -242,8 +244,12 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t
             "url": "https://www.nbcnews.com/business",
             "extracted_content": "...",
             "html": "...",
+            "cleaned_html": "...",
             "markdown": "...",
-            "metadata": {...}
+            "media": {...},
+            "links": {...},
+            "metadata": {...},
+            "screenshot": "..."
         }
     ]
 }
@@ -282,6 +288,24 @@ Crawl result without raw HTML content:
 result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
 ```
 
+### Result Structure
+
+The result object contains the following fields:
+```python
+class CrawlResult(BaseModel):
+    url: str
+    html: str
+    success: bool
+    cleaned_html: Optional[str] = None
+    media: Dict[str, List[Dict]] = {}  # Media tags in the page {"images": [], "audio": [], "video": []}
+    links: Dict[str, List[Dict]] = {}  # Links in the page {"external": [], "internal": []}
+    screenshot: Optional[str] = None   # Base64-encoded screenshot
+    markdown: Optional[str] = None
+    extracted_content: Optional[str] = None
+    metadata: Optional[dict] = None
+    error_message: Optional[str] = None
+```
+
 ### Taking Screenshots
 
 ```python
@@ -401,6 +425,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
 | `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
 | `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
 | `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
+| `user_agent` | The user agent to use for the HTTP requests. | No | `Mozilla/5.0` |
 | `verbose` | Whether to enable verbose logging. | No | `true` |
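+
+For example, a minimal sketch of overriding the user agent (the UA string below is just the example noted in the changelog; any valid UA works):
+
+```python
+result = crawler.run(
+    url="https://www.nbcnews.com/business",
+    user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",
+)
+```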
 
 ## Chunking Strategies 📚
diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index b85055a5..1f258613 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -44,6 +44,10 @@ class CrawlerStrategy(ABC):
     @abstractmethod
     def take_screenshot(self, save_path: str): pass
+
+    @abstractmethod
+    def update_user_agent(self, user_agent: str):
+        pass
 
 class CloudCrawlerStrategy(CrawlerStrategy):
     def __init__(self, use_cached_html = False):
@@ -69,6 +73,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
         self.options = Options()
         self.options.headless = True
+        if kwargs.get("user_agent"):
+            self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
         self.options.add_argument("--no-sandbox")
         self.options.add_argument("--headless")
         # self.options.add_argument("--disable-dev-shm-usage")
@@ -97,6 +103,11 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.service.log_path = "NUL"
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
 
+    def update_user_agent(self, user_agent: str):
+        self.options.add_argument(f"--user-agent={user_agent}")
+        self.driver.quit()
+        self.driver = webdriver.Chrome(service=self.service, options=self.options)
+
     def crawl(self, url: str) -> str:
         if self.use_cached_html:
             cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
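(For context: a minimal sketch of the new `update_user_agent` hook on the Selenium strategy. The UA strings and URL below are placeholders, not from this diff.)

```python
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

# Start Chrome with an initial user agent (read from kwargs in __init__).
strategy = LocalSeleniumCrawlerStrategy(user_agent="MyBot/1.0")
html = strategy.crawl("https://example.com")

# update_user_agent appends a new --user-agent flag, quits the current
# driver, and restarts Chrome so later crawls use the new UA.
strategy.update_user_agent("MyBot/2.0")
html = strategy.crawl("https://example.com")
```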
diff --git a/crawl4ai/database.py b/crawl4ai/database.py
index b4734bc6..47f41748 100644
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -21,6 +21,7 @@ def init_db():
             success BOOLEAN,
             media TEXT DEFAULT "{}",
             link TEXT DEFAULT "{}",
+            metadata TEXT DEFAULT "{}",
             screenshot TEXT DEFAULT ""
         )
     ''')
@@ -42,12 +43,12 @@ def check_db_path():
     if not DB_PATH:
         raise ValueError("Database path is not set or is empty.")
 
-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, bool, str]]:
+def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
     check_db_path()
     try:
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
         result = cursor.fetchone()
         conn.close()
         return result
@@ -55,14 +56,14 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, boo
         print(f"Error retrieving cached URL: {e}")
         return None
 
-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", screenshot: str = ""):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
     check_db_path()
     try:
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
         cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot)
-            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ON CONFLICT(url) DO UPDATE SET
                 html = excluded.html,
                 cleaned_html = excluded.cleaned_html,
@@ -70,9 +71,10 @@ def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_c
                 extracted_content = excluded.extracted_content,
                 success = excluded.success,
                 media = excluded.media,
-                links = excluded.links,
+                links = excluded.links,
+                metadata = excluded.metadata,
                 screenshot = excluded.screenshot
-        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot))
+        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
         conn.commit()
         conn.close()
     except Exception as e:
@@ -126,5 +128,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
 
 if __name__ == "__main__":
     init_db() # Initialize the database if not already initialized
-    alter_db_add_screenshot("links") # Add the new column to the table
-    update_existing_records("links") # Update existing records to set the new column to an empty string
+    alter_db_add_screenshot("metadata") # Add the new column to the table
+    update_existing_records("metadata") # Update existing records to set the new column to an empty string
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index c931c865..f201ba0b 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -359,6 +359,47 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
         print('Error processing HTML content:', str(e))
         raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
+
+
+def extract_metadata(html):
+    metadata = {}
+
+    if not html:
+        return metadata
+
+    # Parse HTML content with BeautifulSoup
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Title
+    title_tag = soup.find('title')
+    metadata['title'] = title_tag.string if title_tag else None
+
+    # Meta description (use .get so a tag without a content attribute returns None)
+    description_tag = soup.find('meta', attrs={'name': 'description'})
+    metadata['description'] = description_tag.get('content') if description_tag else None
+
+    # Meta keywords
+    keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
+    metadata['keywords'] = keywords_tag.get('content') if keywords_tag else None
+
+    # Meta author
+    author_tag = soup.find('meta', attrs={'name': 'author'})
+    metadata['author'] = author_tag.get('content') if author_tag else None
+
+    # Open Graph metadata
+    og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
+    for tag in og_tags:
+        property_name = tag['property']
+        metadata[property_name] = tag.get('content')
+
+    # Twitter Card metadata
+    twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
+    for tag in twitter_tags:
+        property_name = tag['name']
+        metadata[property_name] = tag.get('content')
+
+    return metadata
+
 def extract_xml_tags(string):
     tags = re.findall(r'<(\w+)>', string)
     return list(set(tags))
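(A quick sketch of what `extract_metadata` returns; the sample HTML is made up for illustration.)

```python
from crawl4ai.utils import extract_metadata

html = """<html><head>
<title>Example</title>
<meta name="description" content="A sample page">
<meta property="og:title" content="Example OG Title">
</head><body></body></html>"""

print(extract_metadata(html))
# {'title': 'Example', 'description': 'A sample page', 'keywords': None,
#  'author': None, 'og:title': 'Example OG Title'}
```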
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index a89d27e0..0286c0cf 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -89,8 +89,11 @@ class WebCrawler:
         css_selector: str = None,
         screenshot: bool = False,
+        user_agent: str = None,
         verbose=True,
         **kwargs,
     ) -> CrawlResult:
+        if user_agent:
+            self.crawler_strategy.update_user_agent(user_agent)
         extraction_strategy = extraction_strategy or NoExtractionStrategy()
         extraction_strategy.verbose = verbose
         # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
@@ -117,7 +120,8 @@ class WebCrawler:
                     "success": cached[5],
                     "media": json.loads(cached[6] or "{}"),
                     "links": json.loads(cached[7] or "{}"),
-                    "screenshot": cached[8],
+                    "metadata": json.loads(cached[8] or "{}"),
+                    "screenshot": cached[9],
                     "error_message": "",
                 }
             )
@@ -135,6 +139,7 @@ class WebCrawler:
         # Extract content from HTML
         try:
             result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
+            metadata = extract_metadata(html)
             if result is None:
                 raise ValueError(f"Failed to extract content from the website: {url}")
         except InvalidCSSSelectorError as e:
@@ -180,6 +185,7 @@ class WebCrawler:
             success,
             json.dumps(media),
             json.dumps(links),
+            json.dumps(metadata),
             screenshot=base64_image,
         )
@@ -190,6 +196,7 @@ class WebCrawler:
             markdown=markdown,
             media=media,
             links=links,
+            metadata=metadata,
             screenshot=base64_image,
             extracted_content=extracted_content,
             success=success,
diff --git a/docs/examples/assets/basic.png b/docs/examples/assets/basic.png
index 2506c639..ea68852b 100644
Binary files a/docs/examples/assets/basic.png and b/docs/examples/assets/basic.png differ
diff --git a/docs/examples/assets/css_js.png b/docs/examples/assets/css_js.png
new file mode 100644
index 00000000..9c0d2e60
Binary files /dev/null and b/docs/examples/assets/css_js.png differ
diff --git a/docs/examples/assets/semantic_extraction_cosine.png b/docs/examples/assets/semantic_extraction_cosine.png
new file mode 100644
index 00000000..eace4cf5
Binary files /dev/null and b/docs/examples/assets/semantic_extraction_cosine.png differ
diff --git a/docs/examples/assets/semantic_extraction_llm.png b/docs/examples/assets/semantic_extraction_llm.png
new file mode 100644
index 00000000..1dba8bc6
Binary files /dev/null and b/docs/examples/assets/semantic_extraction_llm.png differ
diff --git a/docs/examples/rest_call.py b/docs/examples/rest_call.py
index 9e74ab47..465c6114 100644
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -1,75 +1,64 @@
 import requests, base64, os
+data = {
+    "urls": ["https://www.nbcnews.com/business"],
+    "screenshot": True,
+}
+
+response = requests.post("https://crawl4ai.com/crawl", json=data)
+result = response.json()['results'][0]
+print(result.keys())
+# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
+#            'links', 'screenshot', 'markdown', 'extracted_content',
+#            'metadata', 'error_message'])
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result['screenshot']))
+
+# Example of filtering the content using CSS selectors
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "css_selector": "article",
+    "screenshot": True,
+}
+
+# Example of executing a JS script on the page before extracting the content
 data = {
     "urls": [
         "https://www.nbcnews.com/business"
     ],
     "screenshot": True,
+    'js' : ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).
+        find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """]
 }
-# Example of filtering the content using CSS selectors
-# data = {
-#     "urls": [
-#         "https://www.nbcnews.com/business"
-#     ],
-#     "css_selector": "article",
-#     "screenshot": True,
-# }
-
-# Example of executing a JS script on the page before extracting the content
-# data = {
-#     "urls": [
-#         "https://www.nbcnews.com/business"
-#     ],
-#     "screenshot": True,
-#     'js' : ["""
-#     const loadMoreButton = Array.from(document.querySelectorAll('button')).
-#     find(button => button.textContent.includes('Load More'));
-#     loadMoreButton && loadMoreButton.click();
-#     """]
-# }
-
 # Example of using a custom extraction strategy
-# data = {
-#     "urls": [
-#         "https://www.nbcnews.com/business"
-#     ],
-#     "extraction_strategy": "CosineStrategy",
-#     "extraction_strategy_args": {
-#         "semantic_filter": "inflation rent prices"
-#     },
-# }
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "extraction_strategy": "CosineStrategy",
+    "extraction_strategy_args": {
+        "semantic_filter": "inflation rent prices"
+    },
+}
 
 # Example of using LLM to extract content
-# data = {
-#     "urls": [
-#         "https://www.nbcnews.com/business"
-#     ],
-#     "extraction_strategy": "LLMExtractionStrategy",
-#     "extraction_strategy_args": {
-#         "provider": "groq/llama3-8b-8192",
-#         "api_token": os.environ.get("GROQ_API_KEY"),
-#         "instruction": """I am interested in only financial news,
-#         and translate them in French."""
-#     },
-# }
-
-response = requests.post("https://crawl4ai.com/crawl", json=data)
-result = response.json()['results'][0]
-
-print(result['markdown'])
-print(result['cleaned_html'])
-print(result['media'])
-print(result['extracted_content'])
-with open("screenshot.png", "wb") as f:
-    f.write(base64.b64decode(result['screenshot']))
-
-
-
-
-
-
-
-
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "extraction_strategy": "LLMExtractionStrategy",
+    "extraction_strategy_args": {
+        "provider": "groq/llama3-8b-8192",
+        "api_token": os.environ.get("GROQ_API_KEY"),
+        "instruction": """I am interested in only financial news,
+        and translate them in French."""
+    },
+}
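(Each `data` payload above is posted the same way; a sketch of reading the new `links` and `metadata` fields from a response, with key names as documented in the README response structure.)

```python
import requests

response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()["results"][0]

print(result["metadata"].get("title"))  # page <title> plus meta, og:*, and twitter:* tags
print(len(result["links"].get("internal", [])), "internal links")
print(len(result["links"].get("external", [])), "external links")
```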
diff --git a/main.py b/main.py
index b3196770..7f605bd2 100644
--- a/main.py
+++ b/main.py
@@ -57,6 +57,7 @@ class CrawlRequest(BaseModel):
     chunking_strategy_args: Optional[dict] = {}
     css_selector: Optional[str] = None
     screenshot: Optional[bool] = False
+    user_agent: Optional[str] = None
    verbose: Optional[bool] = True
@@ -127,6 +128,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
             crawl_request.bypass_cache,
             crawl_request.css_selector,
             crawl_request.screenshot,
+            crawl_request.user_agent,
             crawl_request.verbose
         )
         for url in crawl_request.urls
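(With `user_agent` added to `CrawlRequest`, the self-hosted endpoint accepts it as well; a minimal sketch, where the UA value is a placeholder.)

```python
import requests

data = {
    "urls": ["https://www.nbcnews.com/business"],
    "user_agent": "MyCrawler/1.0",
}
response = requests.post("http://localhost:8000/crawl", json=data)
print(response.json()["results"][0]["metadata"])
```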