Improve library imports

unclecode
2024-05-13 02:46:35 +08:00
parent 11393183f7
commit 5fea6c064b
5 changed files with 231 additions and 125 deletions


@@ -9,49 +9,93 @@ from .extraction_strategy import *
from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor
from .config import *


class WebCrawler:
    def __init__(
        self,
        db_path: str = None,
        crawler_strategy: CrawlerStrategy = LocalSeleniumCrawlerStrategy(),
    ):
        self.db_path = db_path
        self.crawler_strategy = crawler_strategy

        # Create the .crawl4ai folder in the user's home directory if it doesn't exist
        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

        # If db_path is not provided, use the default path inside ~/.crawl4ai
        if not db_path:
            self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
        init_db(self.db_path)

        self.ready = False
    def warmup(self):
        # Pre-crawl a single page so the driver and caches are initialized
        # before the first real request.
        print("[LOG] 🌤️ Warming up the WebCrawler")
        single_url = UrlModel(url="https://crawl4ai.uccode.io/", forced=False)
        result = self.run(
            single_url,
            word_count_threshold=5,
            extraction_strategy=CosineStrategy(),
            verbose=False,
        )
        self.ready = True
        print("[LOG] 🌞 WebCrawler is ready to crawl")
    def fetch_page(
        self,
        url_model: UrlModel,
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        extraction_strategy: ExtractionStrategy = LLMExtractionStrategy(),
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    ) -> CrawlResult:
        # Thin wrapper that delegates to run(); provider, api_token, and
        # use_cached_html are accepted for signature compatibility but unused here.
        return self.run(
            url_model,
            word_count_threshold,
            extraction_strategy,
            chunking_strategy,
            **kwargs,
        )
    def run(
        self,
        url_model: UrlModel,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = NoExtractionStrategy(),
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        # Make sure word_count_threshold is not less than MIN_WORD_THRESHOLD
        if word_count_threshold < MIN_WORD_THRESHOLD:
            word_count_threshold = MIN_WORD_THRESHOLD

        # Check the cache first; a cached row holds
        # (url, html, cleaned_html, markdown, parsed_json, success)
        cached = get_cached_url(self.db_path, str(url_model.url))
        if cached and not url_model.forced:
            return CrawlResult(
                **{
                    "url": cached[0],
                    "html": cached[1],
                    "cleaned_html": cached[2],
                    "markdown": cached[3],
                    "parsed_json": cached[4],
                    "success": cached[5],
                    "error_message": "",
                }
            )

        # Initialize WebDriver for crawling
        t = time.time()
@@ -62,65 +106,89 @@ class WebCrawler:
        except Exception as e:
            html = ""
            success = False
            error_message = str(e)

        # Extract content from HTML
        result = get_content_of_website(html, word_count_threshold)
        cleaned_html = result.get("cleaned_html", html)
        markdown = result.get("markdown", "")

        # Print a professional LOG-style message with the time taken once crawling is done
        if verbose:
            print(
                f"[LOG] 🚀 Crawling done for {url_model.url}, success: {success}, time taken: {time.time() - t} seconds"
            )

        parsed_json = []
        if extract_blocks_flag:
            if verbose:
                print(f"[LOG] 🔥 Extracting semantic blocks for {url_model.url}")
            t = time.time()
            # Split markdown into sections
            sections = chunking_strategy.chunk(markdown)
            # sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
            parsed_json = extraction_strategy.run(
                str(url_model.url), sections,
            )
            parsed_json = json.dumps(parsed_json)
            if verbose:
                print(
                    f"[LOG] 🚀 Extraction done for {url_model.url}, time taken: {time.time() - t} seconds."
                )
        else:
            parsed_json = "{}"
            if verbose:
                print(f"[LOG] 🚀 Skipping extraction for {url_model.url}")
        # Cache the result
        cleaned_html = beautify_html(cleaned_html)
        cache_url(
            self.db_path,
            str(url_model.url),
            html,
            cleaned_html,
            markdown,
            parsed_json,
            success,
        )

        return CrawlResult(
            url=str(url_model.url),
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            parsed_json=parsed_json,
            success=success,
            error_message=error_message,
        )
    def fetch_pages(
        self,
        url_models: List[UrlModel],
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        extraction_strategy: ExtractionStrategy = LLMExtractionStrategy(),
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    ) -> List[CrawlResult]:
        # executor.map only fans out positional arguments, so the wrapper
        # closes over **kwargs and forwards them to each fetch_page call.
        def fetch_page_wrapper(url_model, *args):
            return self.fetch_page(url_model, *args, **kwargs)

        with ThreadPoolExecutor() as executor:
            results = list(
                executor.map(
                    fetch_page_wrapper,
                    url_models,
                    [provider] * len(url_models),
                    [api_token] * len(url_models),
                    [extract_blocks_flag] * len(url_models),
                    [word_count_threshold] * len(url_models),
                    [use_cached_html] * len(url_models),
                    [extraction_strategy] * len(url_models),
                    [chunking_strategy] * len(url_models),
                )
            )

        return results
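
For orientation, a minimal driver sketch for the class above. The module paths and the example URLs are assumptions inferred from the relative imports in this file, not confirmed by the commit:

    from crawl4ai.web_crawler import WebCrawler              # assumed module path
    from crawl4ai.models import UrlModel                     # assumed module path
    from crawl4ai.extraction_strategy import CosineStrategy  # assumed module path

    crawler = WebCrawler()  # db_path defaults to ~/.crawl4ai/crawl4ai.db
    crawler.warmup()        # pre-crawls one page so the driver is ready

    # Single page; forced=True bypasses the cache and crawls fresh
    result = crawler.run(
        UrlModel(url="https://example.com", forced=True),
        word_count_threshold=10,
        extraction_strategy=CosineStrategy(),
    )
    print(result.success, len(result.markdown))

    # Several pages in parallel via the ThreadPoolExecutor in fetch_pages()
    urls = ["https://example.com/a", "https://example.com/b"]
    results = crawler.fetch_pages(
        [UrlModel(url=u, forced=False) for u in urls],
        extraction_strategy=CosineStrategy(),
    )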