Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-06-07 12:44:41 +00:00
parent b3a150f3d1 a19379aa58
commit 04808b5dc9
10 changed files with 79 additions and 24 deletions
--- a/README.md
+++ b/README.md
@@ -95,20 +95,17 @@ from crawl4ai.extraction_strategy import *
 from crawl4ai.crawler_strategy import *
 # Define the JavaScript code to click the "Load More" button
-js_code = """
+js_code = ["""
 const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
 loadMoreButton && loadMoreButton.click();
-"""
+"""]
 # Define the crawling strategy
 crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
 # Create the WebCrawler instance with the defined strategy
 crawler = WebCrawler(crawler_strategy=crawler_strategy)
 crawler = WebCrawler(verbose=True)
 crawler.warmup()
 # Run the crawler with keyword filtering and CSS selector
 result = crawler.run(
    url="https://www.nbcnews.com/business",
    js = js_code,
    extraction_strategy=CosineStrategy(
        semantic_filter="technology",
    ),
@@ -117,6 +114,7 @@ result = crawler.run(
 # Run the crawler with LLM extraction strategy
 result = crawler.run(
    url="https://www.nbcnews.com/business",
    js = js_code,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv('OPENAI_API_KEY'),
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -123,6 +123,8 @@ class WebCrawler:
        # Initialize WebDriver for crawling
        t = time.time()
        if kwargs.get("js", None):
            self.crawler_strategy.js_code = kwargs.get("js")
        html = self.crawler_strategy.crawl(url)
        base64_image = None
        if screenshot:
--- a/docs/.DS_Store
+++ b/docs/.DS_Store
--- a/docs/examples/assets/basic.png
+++ b/docs/examples/assets/basic.png
--- a/docs/examples/assets/cosine_extraction.png
+++ b/docs/examples/assets/cosine_extraction.png
--- a/docs/examples/assets/exec_script.png
+++ b/docs/examples/assets/exec_script.png
--- a/docs/examples/assets/llm_extraction.png
+++ b/docs/examples/assets/llm_extraction.png
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -166,10 +166,11 @@ def interactive_extraction(crawler):
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
-    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        js = js_code
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)
@@ -182,10 +183,11 @@ def multiple_scrip(crawler):
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """] * 2
-    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        js = js_code  
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -1,13 +1,67 @@
-import requests
+
 import requests, base64, os
 data = {
-  "urls": [
+    "urls": [
-    "https://www.nbcnews.com/business"
+        "https://www.nbcnews.com/business"
-  ],
+    ],
-  "word_count_threshold": 5,
+    "screenshot": True,
  "screenshot": True
 }
-response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally 
+
-response_data = response.json()
+# Example of executing a JS script on the page before extracting the content
-print(response_data['results'][0].keys())
+# data = {
 #     "urls": [
 #         "https://www.nbcnews.com/business"
 #     ],
 #     "screenshot": True,
 #     'js' : ["""
 #     const loadMoreButton = Array.from(document.querySelectorAll('button')).
 #     find(button => button.textContent.includes('Load More'));
 #     loadMoreButton && loadMoreButton.click();
 #     """]
 # }
 # Example of using a custom extraction strategy
 # data = {
 #     "urls": [
 #         "https://www.nbcnews.com/business"
 #     ],
 #     "extraction_strategy": "CosineStrategy",
 #     "extraction_strategy_args": {
 #         "semantic_filter": "inflation rent prices"
 #     },
 # }
 # Example of using an LLM to extract content
 # data = {
 #     "urls": [
 #         "https://www.nbcnews.com/business"
 #     ],
 #     "extraction_strategy": "LLMExtractionStrategy",
 #     "extraction_strategy_args": {
 #         "provider": "groq/llama3-8b-8192",
 #         "api_token": os.environ.get("GROQ_API_KEY"),
 #         "instruction": """I am interested in only financial news, 
 #         and translate them in French."""
 #     },
 # }
 response = requests.post("https://crawl4ai.com/crawl", json=data) 
 result = response.json()['results'][0]
 print(result['markdown'])
 print(result['cleaned_html'])
 print(result['media'])
 print(result['extracted_content'])
 with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(result['screenshot']))
--- a/pages/partial/how_to_guide.html
+++ b/pages/partial/how_to_guide.html
@@ -157,9 +157,8 @@ with open("screenshot.png", "wb") as f:
 const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
 loadMoreButton && loadMoreButton.click();
 """]
-crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+crawler = WebCrawler(verbos=crawler_strategy, always_by_pass_cache=True)
-crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
 result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
        <div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
        </div>