Compare commits

..

1 Commit

Author SHA1 Message Date
Umut CAN
3c6ebb73ae Update web_crawler.py
Improve code efficiency, readability, and maintainability in web_crawler.py
2024-08-30 15:30:06 +03:00
5 changed files with 13 additions and 36 deletions

4
.gitignore vendored
View File

@@ -189,6 +189,4 @@ a.txt
.lambda_function.py .lambda_function.py
ec2* ec2*
update_changelog.sh update_changelog.sh
test_env/
tmp/

View File

@@ -101,7 +101,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
variable_values["REQUEST"] = self.instruction variable_values["REQUEST"] = self.instruction
prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
if self.extract_type == "schema" and self.schema: if self.extract_type == "schema":
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION

View File

@@ -834,6 +834,7 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
return sum(all_blocks, []) return sum(all_blocks, [])
def merge_chunks_based_on_token_threshold(chunks, token_threshold): def merge_chunks_based_on_token_threshold(chunks, token_threshold):
""" """
Merges small chunks into larger ones based on the total token threshold. Merges small chunks into larger ones based on the total token threshold.
@@ -879,6 +880,7 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
return extracted_content return extracted_content
def wrap_text(draw, text, font, max_width): def wrap_text(draw, text, font, max_width):
# Wrap the text to fit within the specified width # Wrap the text to fit within the specified width
lines = [] lines = []
@@ -890,6 +892,7 @@ def wrap_text(draw, text, font, max_width):
lines.append(line) lines.append(line)
return '\n'.join(lines) return '\n'.join(lines)
def format_html(html_string): def format_html(html_string):
soup = BeautifulSoup(html_string, 'html.parser') soup = BeautifulSoup(html_string, 'html.parser')
return soup.prettify() return soup.prettify()

View File

@@ -16,40 +16,23 @@ warnings.filterwarnings("ignore", message='Field "model_name" has conflict with
class WebCrawler: class WebCrawler:
def __init__( def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
self,
# db_path: str = None,
crawler_strategy: CrawlerStrategy = None,
always_by_pass_cache: bool = False,
verbose: bool = False,
):
# self.db_path = db_path
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose) self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
self.always_by_pass_cache = always_by_pass_cache self.always_by_pass_cache = always_by_pass_cache
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
# If db_path is not provided, use the default path
# if not db_path:
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
# flush_db()
init_db() init_db()
self.ready = False self.ready = False
def warmup(self): def warmup(self):
print("[LOG] 🌤️ Warming up the WebCrawler") print("[LOG] 🌤️ Warming up the WebCrawler")
result = self.run( self.run(
url='https://google.com/', url='https://google.com/',
word_count_threshold=5, word_count_threshold=5,
extraction_strategy= NoExtractionStrategy(), extraction_strategy=NoExtractionStrategy(),
bypass_cache=False, bypass_cache=False,
verbose = False, verbose=False
# warmup=True
) )
self.ready = True self.ready = True
print("[LOG] 🌞 WebCrawler is ready to crawl") print("[LOG] 🌞 WebCrawler is ready to crawl")
@@ -139,12 +122,8 @@ class WebCrawler:
if not isinstance(chunking_strategy, ChunkingStrategy): if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy") raise ValueError("Unsupported chunking strategy")
# if word_count_threshold < MIN_WORD_THRESHOLD:
# word_count_threshold = MIN_WORD_THRESHOLD
word_count_threshold = max(word_count_threshold, 0) word_count_threshold = max(word_count_threshold, 0)
# Check cache first
cached = None cached = None
screenshot_data = None screenshot_data = None
extracted_content = None extracted_content = None
@@ -169,7 +148,7 @@ class WebCrawler:
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs)) html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
t2 = time.time() t2 = time.time()
if verbose: if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds") print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds")
if screenshot: if screenshot:
screenshot_data = self.crawler_strategy.take_screenshot() screenshot_data = self.crawler_strategy.take_screenshot()
@@ -200,13 +179,10 @@ class WebCrawler:
t = time.time() t = time.time()
# Extract content from HTML # Extract content from HTML
try: try:
# t1 = time.time()
# result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
# print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
t1 = time.time() t1 = time.time()
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
if verbose: if verbose:
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds") print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")
if result is None: if result is None:
raise ValueError(f"Failed to extract content from the website: {url}") raise ValueError(f"Failed to extract content from the website: {url}")
@@ -228,7 +204,7 @@ class WebCrawler:
extracted_content = json.dumps(extracted_content, indent=4, default=str) extracted_content = json.dumps(extracted_content, indent=4, default=str)
if verbose: if verbose:
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.") print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds.")
screenshot = None if not screenshot else screenshot screenshot = None if not screenshot else screenshot

View File

@@ -19,7 +19,7 @@ with open("requirements.txt") as f:
requirements = f.read().splitlines() requirements = f.read().splitlines()
# Define the requirements for different environments # Define the requirements for different environments
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn"))] default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "numpy"))]
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))] torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))] transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]