diff --git a/README.md b/README.md index a49024f0..079382e2 100644 --- a/README.md +++ b/README.md @@ -95,20 +95,17 @@ from crawl4ai.extraction_strategy import * from crawl4ai.crawler_strategy import * # Define the JavaScript code to click the "Load More" button -js_code = """ +js_code = [""" const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click(); -""" - -# Define the crawling strategy -crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) - -# Create the WebCrawler instance with the defined strategy -crawler = WebCrawler(crawler_strategy=crawler_strategy) +"""] +crawler = WebCrawler(verbose=True) +crawler.warmup() # Run the crawler with keyword filtering and CSS selector result = crawler.run( url="https://www.nbcnews.com/business", + js = js_code, extraction_strategy=CosineStrategy( semantic_filter="technology", ), @@ -117,6 +114,7 @@ result = crawler.run( # Run the crawler with LLM extraction strategy result = crawler.run( url="https://www.nbcnews.com/business", + js = js_code, extraction_strategy=LLMExtractionStrategy( provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'), diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index f27bf8cf..6a6445f8 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -123,6 +123,8 @@ class WebCrawler: # Initialize WebDriver for crawling t = time.time() + if kwargs.get("js", None): + self.crawler_strategy.js_code = kwargs.get("js") html = self.crawler_strategy.crawl(url) base64_image = None if screenshot: diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 00000000..94e8e06d Binary files /dev/null and b/docs/.DS_Store differ diff --git a/docs/examples/assets/basic.png b/docs/examples/assets/basic.png new file mode 100644 index 00000000..2506c639 Binary files /dev/null and b/docs/examples/assets/basic.png differ diff --git 
a/docs/examples/assets/cosine_extraction.png b/docs/examples/assets/cosine_extraction.png new file mode 100644 index 00000000..19252ad4 Binary files /dev/null and b/docs/examples/assets/cosine_extraction.png differ diff --git a/docs/examples/assets/exec_script.png b/docs/examples/assets/exec_script.png new file mode 100644 index 00000000..c2e478f7 Binary files /dev/null and b/docs/examples/assets/exec_script.png differ diff --git a/docs/examples/assets/llm_extraction.png b/docs/examples/assets/llm_extraction.png new file mode 100644 index 00000000..95d2accb Binary files /dev/null and b/docs/examples/assets/llm_extraction.png differ diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 0fdd3772..44b0c0e9 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -166,10 +166,11 @@ def interactive_extraction(crawler): const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click(); """ - crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) - crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) + # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) + # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) result = crawler.run( url="https://www.nbcnews.com/business", + js = js_code ) cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") print_result(result) @@ -182,10 +183,11 @@ def multiple_scrip(crawler): const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click(); """] * 2 - crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) - crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) + # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) + # 
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) result = crawler.run( url="https://www.nbcnews.com/business", + js = js_code ) cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") print_result(result) diff --git a/docs/examples/rest_call.py b/docs/examples/rest_call.py index eb462221..0dd39350 100644 --- a/docs/examples/rest_call.py +++ b/docs/examples/rest_call.py @@ -1,18 +1,54 @@ - - -import requests, base64 +import requests, base64, os data = { - "urls": [ - "https://www.nbcnews.com/business" - ], - "screenshot": True + "urls": [ + "https://www.nbcnews.com/business" + ], + "screenshot": True, } -response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally -result = response.json()['results'][0] +# Example of executing a JS script on the page before extracting the content +# data = { +# "urls": [ +# "https://www.nbcnews.com/business" +# ], +# "screenshot": True, +# 'js' : [""" +# const loadMoreButton = Array.from(document.querySelectorAll('button')). 
+# find(button => button.textContent.includes('Load More')); +# loadMoreButton && loadMoreButton.click(); +# """] +# } + +# Example of using a custom extraction strategy +# data = { +# "urls": [ +# "https://www.nbcnews.com/business" +# ], +# "extraction_strategy": "CosineStrategy", +# "extraction_strategy_args": { +# "semantic_filter": "inflation rent prices" +# }, +# } + +# Example of using LLM to extract content +# data = { +# "urls": [ +# "https://www.nbcnews.com/business" +# ], +# "extraction_strategy": "LLMExtractionStrategy", +# "extraction_strategy_args": { +# "provider": "groq/llama3-8b-8192", +# "api_token": os.environ.get("GROQ_API_KEY"), +# "instruction": """I am interested in only financial news, +# and translate them in French.""" +# }, +# } + +response = requests.post("https://crawl4ai.com/crawl", json=data) +result = response.json()['results'][0] print(result['markdown']) print(result['cleaned_html']) @@ -24,3 +60,8 @@ with open("screenshot.png", "wb") as f: + + + + + diff --git a/docs/simplicity.png b/docs/simplicity.png deleted file mode 100644 index fcca1cf8..00000000 Binary files a/docs/simplicity.png and /dev/null differ diff --git a/pages/partial/how_to_guide.html b/pages/partial/how_to_guide.html index 270fcacb..785915c1 100644 --- a/pages/partial/how_to_guide.html +++ b/pages/partial/how_to_guide.html @@ -157,9 +157,8 @@ with open("screenshot.png", "wb") as f: const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click(); """] -crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) -crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) -result = crawler.run(url="https://www.nbcnews.com/business") +crawler = WebCrawler(verbose=True, always_by_pass_cache=True) +result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)