Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-06-07 12:44:41 +00:00
parent b3a150f3d1 a19379aa58
commit 04808b5dc9
10 changed files with 79 additions and 24 deletions
--- a/README.md
+++ b/README.md
@@ -95,20 +95,17 @@ from crawl4ai.extraction_strategy import *
 from crawl4ai.crawler_strategy import *

 # Define the JavaScript code to click the "Load More" button
-js_code = """
+js_code = ["""
 const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
 loadMoreButton && loadMoreButton.click();
-"""
-
-# Define the crawling strategy
-crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-
-# Create the WebCrawler instance with the defined strategy
-crawler = WebCrawler(crawler_strategy=crawler_strategy)
+"""]

+crawler = WebCrawler(verbose=True)
+crawler.warmup()
 # Run the crawler with keyword filtering and CSS selector
 result = crawler.run(
    url="https://www.nbcnews.com/business",
+    js = js_code,
    extraction_strategy=CosineStrategy(
        semantic_filter="technology",
    ),
@@ -117,6 +114,7 @@ result = crawler.run(
 # Run the crawler with LLM extraction strategy
 result = crawler.run(
    url="https://www.nbcnews.com/business",
+    js = js_code,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv('OPENAI_API_KEY'),
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -123,6 +123,8 @@ class WebCrawler:

        # Initialize WebDriver for crawling
        t = time.time()
+        if kwargs.get("js", None):
+            self.crawler_strategy.js_code = kwargs.get("js")
        html = self.crawler_strategy.crawl(url)
        base64_image = None
        if screenshot:
--- a/docs/.DS_Store
+++ b/docs/.DS_Store
--- a/docs/examples/assets/basic.png
+++ b/docs/examples/assets/basic.png
--- a/docs/examples/assets/cosine_extraction.png
+++ b/docs/examples/assets/cosine_extraction.png
--- a/docs/examples/assets/exec_script.png
+++ b/docs/examples/assets/exec_script.png
--- a/docs/examples/assets/llm_extraction.png
+++ b/docs/examples/assets/llm_extraction.png
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -166,10 +166,11 @@ def interactive_extraction(crawler):
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
-    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
+        js = js_code
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)
@@ -182,10 +183,11 @@ def multiple_scrip(crawler):
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """] * 2
-    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
+        js = js_code  
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -1,13 +1,67 @@
-import requests
+
+import requests, base64, os

 data = {
-  "urls": [
-    "https://www.nbcnews.com/business"
-  ],
-  "word_count_threshold": 5,
-  "screenshot": True
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "screenshot": True,
 }

-response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally 
-response_data = response.json()
-print(response_data['results'][0].keys())
+
+# Example of executing a JS script on the page before extracting the content
+# data = {
+#     "urls": [
+#         "https://www.nbcnews.com/business"
+#     ],
+#     "screenshot": True,
+#     'js' : ["""
+#     const loadMoreButton = Array.from(document.querySelectorAll('button')).
+#     find(button => button.textContent.includes('Load More'));
+#     loadMoreButton && loadMoreButton.click();
+#     """]
+# }
+
+# Example of using a custom extraction strategy
+# data = {
+#     "urls": [
+#         "https://www.nbcnews.com/business"
+#     ],
+#     "extraction_strategy": "CosineStrategy",
+#     "extraction_strategy_args": {
+#         "semantic_filter": "inflation rent prices"
+#     },
+# }
+
+# Example of using LLM to extract content
+# data = {
+#     "urls": [
+#         "https://www.nbcnews.com/business"
+#     ],
+#     "extraction_strategy": "LLMExtractionStrategy",
+#     "extraction_strategy_args": {
+#         "provider": "groq/llama3-8b-8192",
+#         "api_token": os.environ.get("GROQ_API_KEY"),
+#         "instruction": """I am interested in only financial news, 
+#         and translate them in French."""
+#     },
+# }
+
+response = requests.post("https://crawl4ai.com/crawl", json=data) 
+result = response.json()['results'][0]
+
+print(result['markdown'])
+print(result['cleaned_html'])
+print(result['media'])
+print(result['extracted_content'])
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result['screenshot']))
+
+
+
+
+
+
+
+
+
--- a/pages/partial/how_to_guide.html
+++ b/pages/partial/how_to_guide.html
@@ -157,9 +157,8 @@ with open("screenshot.png", "wb") as f:
 const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
 loadMoreButton && loadMoreButton.click();
 """]
-crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
+crawler = WebCrawler(verbose=True, always_by_pass_cache=True)
+result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
        <div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
        </div>