diff --git a/crawl4ai/database.py b/crawl4ai/database.py
index 380973b8..b4734bc6 100644
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -20,6 +20,7 @@ def init_db():
             extracted_content TEXT,
             success BOOLEAN,
             media TEXT DEFAULT "{}",
+            links TEXT DEFAULT "{}",
             screenshot TEXT DEFAULT ""
         )
     ''')
@@ -41,12 +42,12 @@ def check_db_path():
     if not DB_PATH:
         raise ValueError("Database path is not set or is empty.")
 
-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
+def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str]]:
     check_db_path()
     try:
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, screenshot FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot FROM crawled_data WHERE url = ?', (url,))
         result = cursor.fetchone()
         conn.close()
         return result
@@ -54,23 +55,24 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, st
         print(f"Error retrieving cached URL: {e}")
         return None
 
-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", screenshot: str = ""):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", screenshot: str = ""):
     check_db_path()
     try:
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
         cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, screenshot)
-            VALUES (?, ?, ?, ?, ?, ?, ?)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
             ON CONFLICT(url) DO UPDATE SET
                 html = excluded.html,
                 cleaned_html = excluded.cleaned_html,
                 markdown = excluded.markdown,
                 extracted_content = excluded.extracted_content,
                 success = excluded.success,
-                media = excluded.media,
+                media = excluded.media,
+                links = excluded.links,
                 screenshot = excluded.screenshot
-        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot))
+        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot))
         conn.commit()
         conn.close()
     except Exception as e:
@@ -124,5 +126,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
 
 if __name__ == "__main__":
     init_db()  # Initialize the database if not already initialized
-    alter_db_add_screenshot()  # Add the new column to the table
-    update_existing_records()  # Update existing records to set the new column to an empty string
+    alter_db_add_screenshot("links")  # Add the new "links" column to the table
+    update_existing_records("links")  # Backfill existing rows with the default empty JSON object
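Reviewer note: the INSERT above relies on SQLite's upsert syntax (`ON CONFLICT(url) DO UPDATE`, available since SQLite 3.24), and the new `links` column stores a JSON-encoded dict just like `media`. A minimal standalone sketch of the round trip, trimmed to the relevant columns and using made-up sample values:

```python
import json
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute('CREATE TABLE crawled_data (url TEXT PRIMARY KEY, links TEXT DEFAULT "{}")')

links = {"internal": [{"href": "/about", "text": "About"}], "external": []}
for _ in range(2):  # the second pass takes the ON CONFLICT branch and updates in place
    conn.execute(
        "INSERT INTO crawled_data (url, links) VALUES (?, ?) "
        "ON CONFLICT(url) DO UPDATE SET links = excluded.links",
        ("https://example.com", json.dumps(links)),
    )

row = conn.execute("SELECT links FROM crawled_data").fetchone()
print(json.loads(row[0]))  # {'internal': [{'href': '/about', 'text': 'About'}], 'external': []}
```

Serializing to JSON keeps the schema flat; callers deserialize with `json.loads`, exactly as `web_crawler.py` does below.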
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 4a21579c..f844b23c 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -11,6 +11,7 @@ class CrawlResult(BaseModel):
     success: bool
     cleaned_html: Optional[str] = None
     media: Dict[str, List[Dict]] = {}
+    links: Dict[str, List[Dict]] = {}
     screenshot: Optional[str] = None
     markdown: Optional[str] = None
    extracted_content: Optional[str] = None
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index cd6f7c93..c931c865 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -151,7 +151,7 @@ class CustomHTML2Text(HTML2Text):
         super().handle_tag(tag, attrs, start)
 
-def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
+def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
     try:
         if not html:
             return None
@@ -170,6 +170,28 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
             for el in selected_elements:
                 div_tag.append(el)
             body = div_tag
+
+        links = {
+            'internal': [],
+            'external': []
+        }
+
+        # Extract all internal and external links
+        for a in body.find_all('a', href=True):
+            href = a['href']
+            url_base = url.split('/')[2]
+            if href.startswith('http') and url_base not in href:
+                links['external'].append({
+                    'href': href,
+                    'text': a.get_text()
+                })
+            else:
+                links['internal'].append(
+                    {
+                        'href': href,
+                        'text': a.get_text()
+                    }
+                )
 
         # Remove script, style, and other tags that don't carry useful content from body
         for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
@@ -329,7 +351,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
             'markdown': markdown,
             'cleaned_html': cleaned_html,
             'success': True,
-            'media': media
+            'media': media,
+            'links': links
         }
 
     except Exception as e:
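The internal/external split in the utils.py hunk keys off `url.split('/')[2]`, which extracts the host portion of the page URL. A condensed standalone sketch of the same heuristic, using invented sample HTML (BeautifulSoup is already how `get_content_of_website` parses the page):

```python
from bs4 import BeautifulSoup

url = "https://example.com/business"
html = '<a href="/about">About</a><a href="https://other.org/x">Other</a>'

soup = BeautifulSoup(html, "html.parser")
url_base = url.split('/')[2]  # host part, e.g. "example.com"

links = {"internal": [], "external": []}
for a in soup.find_all("a", href=True):
    href = a["href"]
    # Absolute URLs on a different host are external; everything else is internal
    bucket = "external" if href.startswith("http") and url_base not in href else "internal"
    links[bucket].append({"href": href, "text": a.get_text()})

print(links)
# {'internal': [{'href': '/about', 'text': 'About'}],
#  'external': [{'href': 'https://other.org/x', 'text': 'Other'}]}
```

One caveat worth flagging in review: `url_base not in href` is a substring test, so an external URL that merely contains the host string (for example in a query parameter) would be classified as internal.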
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 6a6445f8..a89d27e0 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -116,7 +116,8 @@ class WebCrawler:
                     "extracted_content": cached[4],
                     "success": cached[5],
                     "media": json.loads(cached[6] or "{}"),
-                    "screenshot": cached[7],
+                    "links": json.loads(cached[7] or "{}"),
+                    "screenshot": cached[8],
                     "error_message": "",
                 }
             )
@@ -133,15 +134,16 @@ class WebCrawler:
         error_message = ""
         # Extract content from HTML
         try:
-            result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
+            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
             if result is None:
                 raise ValueError(f"Failed to extract content from the website: {url}")
         except InvalidCSSSelectorError as e:
             raise ValueError(str(e))
 
-        cleaned_html = result.get("cleaned_html", html)
+        cleaned_html = result.get("cleaned_html", "")
         markdown = result.get("markdown", "")
-        media = result.get("media", [])
+        media = result.get("media", {})
+        links = result.get("links", {})
 
-        # Print a profession LOG style message, show time taken and say crawling is done
+        # Print a professional LOG-style message showing the time taken once crawling is done
         if verbose:
@@ -177,6 +179,7 @@ class WebCrawler:
             extracted_content,
             success,
             json.dumps(media),
+            json.dumps(links),
             screenshot=base64_image,
         )
 
@@ -186,6 +189,7 @@ class WebCrawler:
             cleaned_html=cleaned_html,
             markdown=markdown,
             media=media,
+            links=links,
             screenshot=base64_image,
             extracted_content=extracted_content,
             success=success,
@@ -229,3 +233,102 @@ class WebCrawler:
         )
 
         return results
+
+
+    def run_less_db(
+        self,
+        url: str,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        bypass_cache: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
+        verbose=True,
+        **kwargs,
+    ) -> CrawlResult:
+        extraction_strategy = extraction_strategy or NoExtractionStrategy()
+        extraction_strategy.verbose = verbose
+        if not isinstance(extraction_strategy, ExtractionStrategy):
+            raise ValueError("Unsupported extraction strategy")
+        if not isinstance(chunking_strategy, ChunkingStrategy):
+            raise ValueError("Unsupported chunking strategy")
+
+        if word_count_threshold < MIN_WORD_THRESHOLD:
+            word_count_threshold = MIN_WORD_THRESHOLD
+
+        # Check cache first
+        cached = None
+        extracted_content = None
+        if not bypass_cache and not self.always_by_pass_cache:
+            cached = get_cached_url(url)
+
+        if cached:
+            html = cached[1]
+            extracted_content = cached[4]  # index 4 = extracted_content in the SELECT column order
+        else:
+            html = self.crawler_strategy.crawl(url)
+            cache_url(url, html, "", "", "", False)  # cache the raw HTML; the full record is written in process_html
+
+        return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, **kwargs)
+
+    def process_html(
+        self,
+        url: str,
+        html: str,
+        extracted_content: str,
+        word_count_threshold: int,
+        extraction_strategy: ExtractionStrategy,
+        chunking_strategy: ChunkingStrategy,
+        css_selector: str,
+        screenshot: bool,
+        verbose: bool,
+        **kwargs,
+    ) -> CrawlResult:
+        t = time.time()
+        base64_image = None
+        if screenshot:
+            base64_image = self.crawler_strategy.take_screenshot()
+
+        # Extract content from HTML
+        try:
+            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
+            if result is None:
+                raise ValueError(f"Failed to extract content from the website: {url}")
+        except InvalidCSSSelectorError as e:
+            raise ValueError(str(e))
+
+        cleaned_html = result.get("cleaned_html", "")
+        markdown = result.get("markdown", "")
+        media = result.get("media", {})
+        links = result.get("links", {})
+
+        if verbose:
+            print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
+
+        if verbose:
+            print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
+
+        sections = chunking_strategy.chunk(markdown)
+
+        if extracted_content is None:
+            extracted_content = extraction_strategy.run(url, sections)
+            extracted_content = json.dumps(extracted_content)
+            # Cache the full record so later get_cached_url hits return every column
+            cache_url(url, html, cleaned_html, markdown, extracted_content, True, json.dumps(media), json.dumps(links), screenshot=base64_image or "")
+
+        if verbose:
+            print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
+
+        return CrawlResult(
+            url=url,
+            html=html,
+            cleaned_html=cleaned_html,
+            markdown=markdown,
+            media=media,
+            links=links,
+            screenshot=base64_image,
+            extracted_content=extracted_content,
+            success=True,
+            error_message="",
+        )
\ No newline at end of file
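To see how the new field surfaces to callers, here is a hypothetical usage sketch; the `WebCrawler` import path and the `warmup()` call follow the project's README conventions and may need adjusting to your setup:

```python
from crawl4ai import WebCrawler  # import path assumed from the package layout

crawler = WebCrawler()
crawler.warmup()  # assumed setup step, as in the README examples

# run_less_db caches only the raw HTML and extracted content up front,
# then defers the heavy lifting to process_html
result = crawler.run_less_db("https://www.nbcnews.com/business")

for link in result.links.get("external", [])[:5]:
    print(link["href"], "->", link["text"].strip())
```

Since `CrawlResult.links` defaults to an empty dict, the `.get(...)` access stays safe even when a page yields no links.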
diff --git a/docs/examples/assets/css_selector.png b/docs/examples/assets/css_selector.png
new file mode 100644
index 00000000..39357bb9
Binary files /dev/null and b/docs/examples/assets/css_selector.png differ
diff --git a/docs/examples/rest_call.py b/docs/examples/rest_call.py
index 0dd39350..9e74ab47 100644
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -8,6 +8,14 @@ data = {
     "screenshot": True,
 }
 
+# Example of filtering the content using CSS selectors
+# data = {
+#     "urls": [
+#         "https://www.nbcnews.com/business"
+#     ],
+#     "css_selector": "article",
+#     "screenshot": True,
+# }
 
 # Example of executing a JS script on the page before extracting the content
 # data = {
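Finally, to round out the rest_call.py change, this is roughly how the CSS-selector payload would be sent; the endpoint URL below is a placeholder for wherever your Crawl4AI REST server is listening:

```python
import requests

data = {
    "urls": ["https://www.nbcnews.com/business"],
    "css_selector": "article",  # only elements matching this selector are extracted
    "screenshot": True,
}

# Placeholder endpoint; substitute the address of your running server.
response = requests.post("http://localhost:8000/crawl", json=data)
response.raise_for_status()
print(response.json())
```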