This commit is contained in:
Unclecode
2024-06-07 12:44:41 +00:00
10 changed files with 79 additions and 24 deletions

View File

@@ -95,20 +95,17 @@ from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import * from crawl4ai.crawler_strategy import *
# Define the JavaScript code to click the "Load More" button # Define the JavaScript code to click the "Load More" button
js_code = """ js_code = ["""
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click(); loadMoreButton && loadMoreButton.click();
""" """]
# Define the crawling strategy
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
# Create the WebCrawler instance with the defined strategy
crawler = WebCrawler(crawler_strategy=crawler_strategy)
crawler = WebCrawler(verbose=True)
crawler.warmup()
# Run the crawler with keyword filtering and CSS selector # Run the crawler with keyword filtering and CSS selector
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
js = js_code,
extraction_strategy=CosineStrategy( extraction_strategy=CosineStrategy(
semantic_filter="technology", semantic_filter="technology",
), ),
@@ -117,6 +114,7 @@ result = crawler.run(
# Run the crawler with LLM extraction strategy # Run the crawler with LLM extraction strategy
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
js = js_code,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o", provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'), api_token=os.getenv('OPENAI_API_KEY'),

View File

@@ -123,6 +123,8 @@ class WebCrawler:
# Initialize WebDriver for crawling # Initialize WebDriver for crawling
t = time.time() t = time.time()
if kwargs.get("js", None):
self.crawler_strategy.js_code = kwargs.get("js")
html = self.crawler_strategy.crawl(url) html = self.crawler_strategy.crawl(url)
base64_image = None base64_image = None
if screenshot: if screenshot:

BIN
docs/.DS_Store vendored Normal file

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 344 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 403 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 469 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 477 KiB

View File

@@ -166,10 +166,11 @@ def interactive_extraction(crawler):
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click(); loadMoreButton && loadMoreButton.click();
""" """
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
js = js_code
) )
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result) print_result(result)
@@ -182,10 +183,11 @@ def multiple_scrip(crawler):
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click(); loadMoreButton && loadMoreButton.click();
"""] * 2 """] * 2
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
js = js_code
) )
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result) print_result(result)

View File

@@ -1,13 +1,67 @@
import requests
import requests, base64, os
data = { data = {
"urls": [ "urls": [
"https://www.nbcnews.com/business" "https://www.nbcnews.com/business"
], ],
"word_count_threshold": 5, "screenshot": True,
"screenshot": True
} }
response = requests.post("https://crawl4ai.com/crawl", json=data) # OR localhost if you run it locally
response_data = response.json() # Example of executing a JS script on the page before extracting the content
print(response_data['results'][0].keys()) # data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "screenshot": True,
# 'js' : ["""
# const loadMoreButton = Array.from(document.querySelectorAll('button')).
# find(button => button.textContent.includes('Load More'));
# loadMoreButton && loadMoreButton.click();
# """]
# }
# Example of using a custom extraction strategy
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "extraction_strategy": "CosineStrategy",
# "extraction_strategy_args": {
# "semantic_filter": "inflation rent prices"
# },
# }
# Example of using LLM to extract content
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "extraction_strategy": "LLMExtractionStrategy",
# "extraction_strategy_args": {
# "provider": "groq/llama3-8b-8192",
# "api_token": os.environ.get("GROQ_API_KEY"),
# "instruction": """I am interested in only financial news,
# and translate them in French."""
# },
# }
response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()['results'][0]
print(result['markdown'])
print(result['cleaned_html'])
print(result['media'])
print(result['extracted_content'])
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result['screenshot']))

View File

@@ -157,9 +157,8 @@ with open("screenshot.png", "wb") as f:
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click(); loadMoreButton && loadMoreButton.click();
"""] """]
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) crawler = WebCrawler(verbose=True, always_by_pass_cache=True)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
<div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div> <div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
</div> </div>