Merge branch 'main' of https://github.com/unclecode/crawl4ai
This commit is contained in:
14
README.md
14
README.md
@@ -95,20 +95,17 @@ from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
|
||||
# Define the JavaScript code to click the "Load More" button
|
||||
js_code = """
|
||||
js_code = ["""
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""
|
||||
|
||||
# Define the crawling strategy
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
|
||||
# Create the WebCrawler instance with the defined strategy
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy)
|
||||
"""]
|
||||
|
||||
crawler = WebCrawler(verbose=True)
|
||||
crawler.warmup()
|
||||
# Run the crawler with keyword filtering and CSS selector
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js = js_code,
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="technology",
|
||||
),
|
||||
@@ -117,6 +114,7 @@ result = crawler.run(
|
||||
# Run the crawler with LLM extraction strategy
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js = js_code,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
|
||||
@@ -123,6 +123,8 @@ class WebCrawler:
|
||||
|
||||
# Initialize WebDriver for crawling
|
||||
t = time.time()
|
||||
if kwargs.get("js", None):
|
||||
self.crawler_strategy.js_code = kwargs.get("js")
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
base64_image = None
|
||||
if screenshot:
|
||||
|
||||
BIN
docs/.DS_Store
vendored
Normal file
BIN
docs/.DS_Store
vendored
Normal file
Binary file not shown.
BIN
docs/examples/assets/basic.png
Normal file
BIN
docs/examples/assets/basic.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 344 KiB |
BIN
docs/examples/assets/cosine_extraction.png
Normal file
BIN
docs/examples/assets/cosine_extraction.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 403 KiB |
BIN
docs/examples/assets/exec_script.png
Normal file
BIN
docs/examples/assets/exec_script.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 469 KiB |
BIN
docs/examples/assets/llm_extraction.png
Normal file
BIN
docs/examples/assets/llm_extraction.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 477 KiB |
@@ -166,10 +166,11 @@ def interactive_extraction(crawler):
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js = js_code
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
@@ -182,10 +183,11 @@ def multiple_scrip(crawler):
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""] * 2
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js = js_code
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
@@ -1,13 +1,67 @@
|
||||
import requests
|
||||
|
||||
import requests, base64, os
|
||||
|
||||
data = {
|
||||
"urls": [
|
||||
"https://www.nbcnews.com/business"
|
||||
],
|
||||
"word_count_threshold": 5,
|
||||
"screenshot": True
|
||||
"urls": [
|
||||
"https://www.nbcnews.com/business"
|
||||
],
|
||||
"screenshot": True,
|
||||
}
|
||||
|
||||
response = requests.post("https://crawl4ai.com/crawl", json=data) # or use localhost if you run the server locally
|
||||
response_data = response.json()
|
||||
print(response_data['results'][0].keys())
|
||||
|
||||
# Example of executing a JS script on the page before extracting the content
|
||||
# data = {
|
||||
# "urls": [
|
||||
# "https://www.nbcnews.com/business"
|
||||
# ],
|
||||
# "screenshot": True,
|
||||
# 'js' : ["""
|
||||
# const loadMoreButton = Array.from(document.querySelectorAll('button')).
|
||||
# find(button => button.textContent.includes('Load More'));
|
||||
# loadMoreButton && loadMoreButton.click();
|
||||
# """]
|
||||
# }
|
||||
|
||||
# Example of using a custom extraction strategy
|
||||
# data = {
|
||||
# "urls": [
|
||||
# "https://www.nbcnews.com/business"
|
||||
# ],
|
||||
# "extraction_strategy": "CosineStrategy",
|
||||
# "extraction_strategy_args": {
|
||||
# "semantic_filter": "inflation rent prices"
|
||||
# },
|
||||
# }
|
||||
|
||||
# Example of using LLM to extract content
|
||||
# data = {
|
||||
# "urls": [
|
||||
# "https://www.nbcnews.com/business"
|
||||
# ],
|
||||
# "extraction_strategy": "LLMExtractionStrategy",
|
||||
# "extraction_strategy_args": {
|
||||
# "provider": "groq/llama3-8b-8192",
|
||||
# "api_token": os.environ.get("GROQ_API_KEY"),
|
||||
# "instruction": """I am interested in only financial news,
|
||||
# and translate them in French."""
|
||||
# },
|
||||
# }
|
||||
|
||||
response = requests.post("https://crawl4ai.com/crawl", json=data)
|
||||
result = response.json()['results'][0]
|
||||
|
||||
print(result['markdown'])
|
||||
print(result['cleaned_html'])
|
||||
print(result['media'])
|
||||
print(result['extracted_content'])
|
||||
with open("screenshot.png", "wb") as f:
|
||||
f.write(base64.b64decode(result['screenshot']))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -157,9 +157,8 @@ with open("screenshot.png", "wb") as f:
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""]
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
|
||||
crawler = WebCrawler(verbose=True, always_by_pass_cache=True)
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
|
||||
<div class="">Remember that you can pass multiple JavaScript code snippets in the list. They will all be executed in the order they are passed.</div>
|
||||
</div>
|
||||
|
||||
|
||||
Reference in New Issue
Block a user