Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-06-08 08:53:54 +00:00
parent 04808b5dc9 9c34b30723
commit 255bde70c9
6 changed files with 151 additions and 14 deletions
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -20,6 +20,7 @@ def init_db():
            extracted_content TEXT,
            success BOOLEAN,
            media TEXT DEFAULT "{}",
            link TEXT DEFAULT "{}",
            screenshot TEXT DEFAULT ""
        )
    ''')
@@ -41,12 +42,12 @@ def check_db_path():
    if not DB_PATH:
        raise ValueError("Database path is not set or is empty.")
-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
+def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, bool, str]]:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, screenshot FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
@@ -54,23 +55,24 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, st
        print(f"Error retrieving cached URL: {e}")
        return None
-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", screenshot: str = ""):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", screenshot: str = ""):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, screenshot)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot)
-            VALUES (?, ?, ?, ?, ?, ?, ?)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                extracted_content = excluded.extracted_content,
                success = excluded.success,
-                media = excluded.media,                
+                media = excluded.media,      
                links = excluded.links,          
                screenshot = excluded.screenshot
-        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot))
+        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot))
        conn.commit()
        conn.close()
    except Exception as e:
@@ -124,5 +126,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
 if __name__ == "__main__":
    init_db()  # Initialize the database if not already initialized
-    alter_db_add_screenshot()  # Add the new column to the table
+    alter_db_add_screenshot("links")  # Add the new column to the table
-    update_existing_records()  # Update existing records to set the new column to an empty string
+    update_existing_records("links")  # Update existing records to set the new column to an empty string
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -11,6 +11,7 @@ class CrawlResult(BaseModel):
    success: bool
    cleaned_html: Optional[str] = None
    media: Dict[str, List[Dict]] = {}
    links: Dict[str, List[Dict]] = {}
    screenshot: Optional[str] = None
    markdown: Optional[str] = None
    extracted_content: Optional[str] = None
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -151,7 +151,7 @@ class CustomHTML2Text(HTML2Text):
        super().handle_tag(tag, attrs, start)
-def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
+def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
    try:
        if not html:
            return None
@@ -170,6 +170,28 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
            for el in selected_elements:
                div_tag.append(el)
            body = div_tag
        links = {
            'internal': [],
            'external': []
        }
        # Extract all internal and external links
        for a in body.find_all('a', href=True):
            href = a['href']
            url_base = url.split('/')[2]
            if href.startswith('http') and url_base not in href:
                links['external'].append({
                    'href': href,
                    'text': a.get_text()
                })
            else:
                links['internal'].append(
                    {
                        'href': href,
                        'text': a.get_text()
                    }
                )
        # Remove script, style, and other tags that don't carry useful content from body
        for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
@@ -329,7 +351,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
            'markdown': markdown,
            'cleaned_html': cleaned_html,
            'success': True,
-            'media': media
+            'media': media,
            'links': links
        }
    except Exception as e:
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -116,7 +116,8 @@ class WebCrawler:
                        "extracted_content": cached[4],
                        "success": cached[5],
                        "media": json.loads(cached[6] or "{}"),
-                        "screenshot": cached[7],
+                        "links": json.loads(cached[7] or "{}"),
                        "screenshot": cached[8],
                        "error_message": "",
                    }
                )
@@ -133,15 +134,16 @@ class WebCrawler:
        error_message = ""
        # Extract content from HTML
        try:
-            result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
+            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
            if result is None:
                raise ValueError(f"Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))
-        cleaned_html = result.get("cleaned_html", html)
+        cleaned_html = result.get("cleaned_html", "")
        markdown = result.get("markdown", "")
        media = result.get("media", [])
        links = result.get("links", [])
        # Print a professional LOG-style message, show time taken and say crawling is done
        if verbose:
@@ -177,6 +179,7 @@ class WebCrawler:
            extracted_content,
            success,
            json.dumps(media),
            json.dumps(links),
            screenshot=base64_image,
        )
@@ -186,6 +189,7 @@ class WebCrawler:
            cleaned_html=cleaned_html,
            markdown=markdown,
            media=media,
            links=links,
            screenshot=base64_image,
            extracted_content=extracted_content,
            success=success,
@@ -229,3 +233,102 @@ class WebCrawler:
            )
        return results
    def run_less_db(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        """Crawl ``url`` (or reuse its cached HTML) and hand off to ``process_html``.

        Only the raw HTML and the previously extracted content are taken from
        the cache; cleaned HTML, markdown, media and links are recomputed by
        ``process_html`` on every call.

        Args:
            url: Page to crawl.
            word_count_threshold: Forwarded to ``process_html``; values below
                ``MIN_WORD_THRESHOLD`` are raised to it.
            extraction_strategy: Strategy used to extract content; defaults to
                ``NoExtractionStrategy()`` when falsy.
            chunking_strategy: Strategy used to split the markdown.
            bypass_cache: Skip the cache lookup for this call.
            css_selector: Forwarded to ``process_html``.
            screenshot: Forwarded to ``process_html``.
            verbose: Enables ``[LOG]`` output and is copied onto the
                extraction strategy.
            **kwargs: Forwarded unchanged to ``process_html``.

        Returns:
            The ``CrawlResult`` produced by ``process_html``.

        Raises:
            ValueError: If either strategy is not an instance of its expected
                base class.
        """
        extraction_strategy = extraction_strategy or NoExtractionStrategy()
        # Validate before touching attributes so that an unsupported object
        # yields the documented ValueError rather than an AttributeError.
        if not isinstance(extraction_strategy, ExtractionStrategy):
            raise ValueError("Unsupported extraction strategy")
        if not isinstance(chunking_strategy, ChunkingStrategy):
            raise ValueError("Unsupported chunking strategy")
        extraction_strategy.verbose = verbose

        word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)

        # Check cache first
        cached = None
        if not bypass_cache and not self.always_by_pass_cache:
            cached = get_cached_url(url)

        if cached:
            # Row layout follows the SELECT in get_cached_url:
            # (url, html, cleaned_html, markdown, extracted_content, ...).
            html = cached[1]
            # An empty value means extraction never completed for this row,
            # so treat it as "not extracted yet".
            extracted_content = cached[4] or None
        else:
            extracted_content = None
            html = self.crawler_strategy.crawl(url)
            # cache_url requires cleaned_html, markdown, extracted_content and
            # success positionally. Only the raw HTML is known at this point,
            # so store placeholders and flag the row as not yet successful.
            cache_url(url, html, "", "", "", False)

        return self.process_html(
            url,
            html,
            extracted_content,
            word_count_threshold,
            extraction_strategy,
            chunking_strategy,
            css_selector,
            screenshot,
            verbose,
            **kwargs,
        )
    def process_html(
        self,
        url: str,
        html: str,
        extracted_content: str,
        word_count_threshold: int,
        extraction_strategy: ExtractionStrategy,
        chunking_strategy: ChunkingStrategy,
        css_selector: str,
        screenshot: bool,
        verbose: bool,
        **kwargs,
    ) -> CrawlResult:
        """Turn raw ``html`` into a ``CrawlResult``.

        Runs ``get_content_of_website`` to obtain cleaned HTML, markdown,
        media and links, chunks the markdown, and runs the extraction strategy
        unless ``extracted_content`` was already supplied. Freshly extracted
        content is written back through ``cache_url``.

        Args:
            url: URL the HTML was fetched from.
            html: Raw page HTML.
            extracted_content: Previously extracted content, or ``None`` to
                run ``extraction_strategy`` now.
            word_count_threshold: Forwarded to ``get_content_of_website``.
            extraction_strategy: Strategy whose ``run(url, sections)`` result
                is JSON-encoded into ``extracted_content``.
            chunking_strategy: Strategy whose ``chunk(markdown)`` output is
                fed to the extraction strategy.
            css_selector: Forwarded to ``get_content_of_website``.
            screenshot: When true, ``self.crawler_strategy.take_screenshot()``
                is called and its result is returned in the ``CrawlResult``.
            verbose: Print ``[LOG]`` progress lines.
            **kwargs: Accepted for call compatibility; not used here.

        Returns:
            A ``CrawlResult`` with ``success=True`` and an empty
            ``error_message``.

        Raises:
            ValueError: If content extraction returns ``None`` or the CSS
                selector is rejected with ``InvalidCSSSelectorError``.
        """
        t = time.time()

        base64_image = None
        if screenshot:
            base64_image = self.crawler_strategy.take_screenshot()

        # Extract content from HTML
        try:
            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
        except InvalidCSSSelectorError as e:
            raise ValueError(str(e)) from e
        if result is None:
            raise ValueError(f"Failed to extract content from the website: {url}")

        cleaned_html = result.get("cleaned_html", "")
        markdown = result.get("markdown", "")
        # CrawlResult declares media and links as dicts, so fall back to {}.
        media = result.get("media", {})
        links = result.get("links", {})

        if verbose:
            print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
            print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")

        sections = chunking_strategy.chunk(markdown)
        if extracted_content is None:
            extracted_content = json.dumps(extraction_strategy.run(url, sections))
            # Cache the complete row. cache_url takes
            # (url, html, cleaned_html, markdown, extracted_content, success,
            # media, links, screenshot); media and links are stored as JSON.
            cache_url(
                url,
                html,
                cleaned_html,
                markdown,
                extracted_content,
                True,
                json.dumps(media),
                json.dumps(links),
                screenshot=base64_image or "",
            )

        if verbose:
            print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")

        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            media=media,
            links=links,
            screenshot=base64_image,
            extracted_content=extracted_content,
            success=True,
            error_message="",
        )
--- a/docs/examples/assets/css_selector.png
+++ b/docs/examples/assets/css_selector.png
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -8,6 +8,14 @@ data = {
    "screenshot": True,
 }
 # Example of filtering the content using CSS selectors
 # data = {
 #     "urls": [
 #         "https://www.nbcnews.com/business"
 #     ],
 #     "css_selector": "article",
 #     "screenshot": True,
 # }
 # Example of executing a JS script on the page before extracting the content
 # data = {