Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-06-08 08:53:54 +00:00
parent 04808b5dc9 9c34b30723
commit 255bde70c9
6 changed files with 151 additions and 14 deletions
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -20,6 +20,7 @@ def init_db():
            extracted_content TEXT,
            success BOOLEAN,
            media TEXT DEFAULT "{}",
+            link TEXT DEFAULT "{}",
            screenshot TEXT DEFAULT ""
        )
    ''')
@@ -41,12 +42,12 @@ def check_db_path():
    if not DB_PATH:
        raise ValueError("Database path is not set or is empty.")

-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
+def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, bool, str]]:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, screenshot FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
@@ -54,23 +55,24 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, st
        print(f"Error retrieving cached URL: {e}")
        return None

-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", screenshot: str = ""):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", screenshot: str = ""):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, screenshot)
-            VALUES (?, ?, ?, ?, ?, ?, ?)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                extracted_content = excluded.extracted_content,
                success = excluded.success,
-                media = excluded.media,                
+                media = excluded.media,      
+                links = excluded.links,          
                screenshot = excluded.screenshot
-        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot))
+        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot))
        conn.commit()
        conn.close()
    except Exception as e:
@@ -124,5 +126,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"

 if __name__ == "__main__":
    init_db()  # Initialize the database if not already initialized
-    alter_db_add_screenshot()  # Add the new column to the table
-    update_existing_records()  # Update existing records to set the new column to an empty string
+    alter_db_add_screenshot("links")  # Add the "links" column to the table (the helper is called here with the column name, despite its "screenshot" name)
+    update_existing_records("links")  # Backfill existing records: "links" is set to update_existing_records' default_value ("{}"), not an empty string
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -11,6 +11,7 @@ class CrawlResult(BaseModel):
    success: bool
    cleaned_html: Optional[str] = None
    media: Dict[str, List[Dict]] = {}
+    links: Dict[str, List[Dict]] = {}
    screenshot: Optional[str] = None
    markdown: Optional[str] = None
    extracted_content: Optional[str] = None
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -151,7 +151,7 @@ class CustomHTML2Text(HTML2Text):

        super().handle_tag(tag, attrs, start)

-def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
+def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
    try:
        if not html:
            return None
@@ -170,6 +170,28 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
            for el in selected_elements:
                div_tag.append(el)
            body = div_tag
+            
+        links = {
+            'internal': [],
+            'external': []
+        }
+        
+        # Extract all internal and external links
+        for a in body.find_all('a', href=True):
+            href = a['href']
+            url_base = url.split('/')[2]
+            if href.startswith('http') and url_base not in href:
+                links['external'].append({
+                    'href': href,
+                    'text': a.get_text()
+                })
+            else:
+                links['internal'].append(
+                    {
+                        'href': href,
+                        'text': a.get_text()
+                    }
+                )

        # Remove script, style, and other tags that don't carry useful content from body
        for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
@@ -329,7 +351,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
            'markdown': markdown,
            'cleaned_html': cleaned_html,
            'success': True,
-            'media': media
+            'media': media,
+            'links': links
        }

    except Exception as e:
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -116,7 +116,8 @@ class WebCrawler:
                        "extracted_content": cached[4],
                        "success": cached[5],
                        "media": json.loads(cached[6] or "{}"),
-                        "screenshot": cached[7],
+                        "links": json.loads(cached[7] or "{}"),
+                        "screenshot": cached[8],
                        "error_message": "",
                    }
                )
@@ -133,15 +134,16 @@ class WebCrawler:
        error_message = ""
        # Extract content from HTML
        try:
-            result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
+            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
            if result is None:
                raise ValueError(f"Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))
        
-        cleaned_html = result.get("cleaned_html", html)
+        cleaned_html = result.get("cleaned_html", "")
        markdown = result.get("markdown", "")
        media = result.get("media", [])
+        links = result.get("links", [])

        # Print a profession LOG style message, show time taken and say crawling is done
        if verbose:
@@ -177,6 +179,7 @@ class WebCrawler:
            extracted_content,
            success,
            json.dumps(media),
+            json.dumps(links),
            screenshot=base64_image,
        )

@@ -186,6 +189,7 @@ class WebCrawler:
            cleaned_html=cleaned_html,
            markdown=markdown,
            media=media,
+            links=links,
            screenshot=base64_image,
            extracted_content=extracted_content,
            success=success,
@@ -229,3 +233,102 @@ class WebCrawler:
            )

        return results
+
+
+    def run_less_db(
+        self,
+        url: str,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        bypass_cache: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
+        verbose=True,
+        **kwargs,
+    ) -> CrawlResult:
+        """Crawl ``url`` reusing only the cached HTML and extracted content.
+
+        On a cache hit the stored HTML and extracted content are reused and
+        everything else is recomputed by ``process_html``. On a miss the page
+        is fetched through ``self.crawler_strategy`` and nothing is written to
+        the cache here: ``cache_url`` needs the cleaned HTML, markdown,
+        extracted content and success flag, none of which exist yet, so
+        persistence is left to ``process_html`` once extraction has run.
+
+        Args:
+            url: Page to crawl.
+            word_count_threshold: Minimum word count passed to content
+                extraction; values below ``MIN_WORD_THRESHOLD`` are raised
+                to it.
+            extraction_strategy: Strategy used to build the extracted
+                content; defaults to ``NoExtractionStrategy()``.
+            chunking_strategy: Strategy used to split the markdown.
+            bypass_cache: Skip the cache lookup when true.
+            css_selector: Optional selector restricting the extracted content.
+            screenshot: Whether to capture a screenshot.
+            verbose: Whether to print progress messages.
+            **kwargs: Forwarded unchanged to ``process_html``.
+
+        Returns:
+            The ``CrawlResult`` produced by ``process_html``.
+
+        Raises:
+            ValueError: If either strategy is not of the supported base type.
+        """
+        extraction_strategy = extraction_strategy or NoExtractionStrategy()
+        # Validate before mutating: assigning ``verbose`` on an arbitrary
+        # object could fail with AttributeError and hide the intended error.
+        if not isinstance(extraction_strategy, ExtractionStrategy):
+            raise ValueError("Unsupported extraction strategy")
+        if not isinstance(chunking_strategy, ChunkingStrategy):
+            raise ValueError("Unsupported chunking strategy")
+        extraction_strategy.verbose = verbose
+
+        word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
+
+        # Check cache first
+        cached = None
+        if not bypass_cache and not self.always_by_pass_cache:
+            cached = get_cached_url(url)
+
+        if cached:
+            # Row layout follows the SELECT in get_cached_url:
+            # (url, html, cleaned_html, markdown, extracted_content, ...).
+            html = cached[1]
+            extracted_content = cached[4]
+        else:
+            html = self.crawler_strategy.crawl(url)
+            # None tells process_html to run the extraction strategy.
+            extracted_content = None
+
+        return self.process_html(
+            url,
+            html,
+            extracted_content,
+            word_count_threshold,
+            extraction_strategy,
+            chunking_strategy,
+            css_selector,
+            screenshot,
+            verbose,
+            **kwargs,
+        )
+
+    def process_html(
+        self,
+        url: str,
+        html: str,
+        extracted_content: str,
+        word_count_threshold: int,
+        extraction_strategy: ExtractionStrategy,
+        chunking_strategy: ChunkingStrategy,
+        css_selector: str,
+        screenshot: bool,
+        verbose: bool,
+        **kwargs,
+    ) -> CrawlResult:
+        """Turn raw HTML into a ``CrawlResult``, extracting content if needed.
+
+        Args:
+            url: URL the HTML was fetched from.
+            html: Raw page HTML.
+            extracted_content: Previously extracted content, or ``None`` to
+                run ``extraction_strategy`` and cache the outcome.
+            word_count_threshold: Minimum word count passed to
+                ``get_content_of_website``.
+            extraction_strategy: Strategy run on the chunked markdown when
+                ``extracted_content`` is ``None``.
+            chunking_strategy: Strategy that splits the markdown into
+                sections for extraction.
+            css_selector: Optional selector restricting the extracted content.
+            screenshot: Whether to capture a screenshot through
+                ``self.crawler_strategy``.
+            verbose: Whether to print progress messages.
+            **kwargs: Accepted for call compatibility; not used here.
+
+        Returns:
+            A successful ``CrawlResult`` for ``url``.
+
+        Raises:
+            ValueError: If content extraction returns ``None`` or the CSS
+                selector is invalid.
+        """
+        t = time.time()
+        base64_image = None
+        if screenshot:
+            base64_image = self.crawler_strategy.take_screenshot()
+
+        # Extract content from HTML
+        try:
+            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
+            if result is None:
+                raise ValueError(f"Failed to extract content from the website: {url}")
+        except InvalidCSSSelectorError as e:
+            raise ValueError(str(e)) from e
+
+        cleaned_html = result.get("cleaned_html", "")
+        markdown = result.get("markdown", "")
+        # CrawlResult declares media/links as dicts, so fall back to a dict.
+        media = result.get("media", {})
+        links = result.get("links", {})
+
+        if verbose:
+            print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
+
+        if verbose:
+            print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
+
+        if extracted_content is None:
+            # Chunk only when extraction actually runs; cached content skips it.
+            sections = chunking_strategy.chunk(markdown)
+            extracted_content = extraction_strategy.run(url, sections)
+            extracted_content = json.dumps(extracted_content)
+            # Cache the full record; cache_url requires every column up to
+            # ``success`` positionally, in this order.
+            cache_url(
+                url,
+                html,
+                cleaned_html,
+                markdown,
+                extracted_content,
+                True,
+                json.dumps(media),
+                json.dumps(links),
+                screenshot=base64_image,
+            )
+
+        if verbose:
+            print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
+
+        return CrawlResult(
+            url=url,
+            html=html,
+            cleaned_html=cleaned_html,
+            markdown=markdown,
+            media=media,
+            links=links,
+            screenshot=base64_image,
+            extracted_content=extracted_content,
+            success=True,
+            error_message="",
+        )
--- a/docs/examples/assets/css_selector.png
+++ b/docs/examples/assets/css_selector.png
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -8,6 +8,14 @@ data = {
    "screenshot": True,
 }

+# Example of filtering the content using CSS selectors
+# data = {
+#     "urls": [
+#         "https://www.nbcnews.com/business"
+#     ],
+#     "css_selector": "article",
+#     "screenshot": True,
+# }

 # Example of executing a JS script on the page before extracting the content
 # data = {