Merge branch 'main' of https://github.com/unclecode/crawl4ai
This commit is contained in:
14
README.md
14
README.md
@@ -95,20 +95,17 @@ from crawl4ai.extraction_strategy import *
|
|||||||
from crawl4ai.crawler_strategy import *
|
from crawl4ai.crawler_strategy import *
|
||||||
|
|
||||||
# Define the JavaScript code to click the "Load More" button
|
# Define the JavaScript code to click the "Load More" button
|
||||||
js_code = """
|
js_code = ["""
|
||||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||||
loadMoreButton && loadMoreButton.click();
|
loadMoreButton && loadMoreButton.click();
|
||||||
"""
|
"""]
|
||||||
|
|
||||||
# Define the crawling strategy
|
|
||||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
|
||||||
|
|
||||||
# Create the WebCrawler instance with the defined strategy
|
|
||||||
crawler = WebCrawler(crawler_strategy=crawler_strategy)
|
|
||||||
|
|
||||||
|
crawler = WebCrawler(verbose=True)
|
||||||
|
crawler.warmup()
|
||||||
# Run the crawler with keyword filtering and CSS selector
|
# Run the crawler with keyword filtering and CSS selector
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
|
js = js_code,
|
||||||
extraction_strategy=CosineStrategy(
|
extraction_strategy=CosineStrategy(
|
||||||
semantic_filter="technology",
|
semantic_filter="technology",
|
||||||
),
|
),
|
||||||
@@ -117,6 +114,7 @@ result = crawler.run(
|
|||||||
# Run the crawler with LLM extraction strategy
|
# Run the crawler with LLM extraction strategy
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
|
js = js_code,
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
provider="openai/gpt-4o",
|
provider="openai/gpt-4o",
|
||||||
api_token=os.getenv('OPENAI_API_KEY'),
|
api_token=os.getenv('OPENAI_API_KEY'),
|
||||||
|
|||||||
@@ -123,6 +123,8 @@ class WebCrawler:
|
|||||||
|
|
||||||
# Initialize WebDriver for crawling
|
# Initialize WebDriver for crawling
|
||||||
t = time.time()
|
t = time.time()
|
||||||
|
if kwargs.get("js", None):
|
||||||
|
self.crawler_strategy.js_code = kwargs.get("js")
|
||||||
html = self.crawler_strategy.crawl(url)
|
html = self.crawler_strategy.crawl(url)
|
||||||
base64_image = None
|
base64_image = None
|
||||||
if screenshot:
|
if screenshot:
|
||||||
|
|||||||
BIN
docs/.DS_Store
vendored
Normal file
BIN
docs/.DS_Store
vendored
Normal file
Binary file not shown.
BIN
docs/examples/assets/basic.png
Normal file
BIN
docs/examples/assets/basic.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 344 KiB |
BIN
docs/examples/assets/cosine_extraction.png
Normal file
BIN
docs/examples/assets/cosine_extraction.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 403 KiB |
BIN
docs/examples/assets/exec_script.png
Normal file
BIN
docs/examples/assets/exec_script.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 469 KiB |
BIN
docs/examples/assets/llm_extraction.png
Normal file
BIN
docs/examples/assets/llm_extraction.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 477 KiB |
@@ -166,10 +166,11 @@ def interactive_extraction(crawler):
|
|||||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||||
loadMoreButton && loadMoreButton.click();
|
loadMoreButton && loadMoreButton.click();
|
||||||
"""
|
"""
|
||||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
|
js = js_code
|
||||||
)
|
)
|
||||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
@@ -182,10 +183,11 @@ def multiple_scrip(crawler):
|
|||||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||||
loadMoreButton && loadMoreButton.click();
|
loadMoreButton && loadMoreButton.click();
|
||||||
"""] * 2
|
"""] * 2
|
||||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
|
js = js_code
|
||||||
)
|
)
|
||||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|||||||
@@ -1,13 +1,67 @@
|
|||||||
import requests
|
|
||||||
|
import requests, base64, os
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"urls": [
|
"urls": [
|
||||||
"https://www.nbcnews.com/business"
|
"https://www.nbcnews.com/business"
|
||||||
],
|
],
|
||||||
"word_count_threshold": 5,
|
"screenshot": True,
|
||||||
"screenshot": True
|
|
||||||
}
|
}
|
||||||
|
|
||||||
response = requests.post("https://crawl4ai.com/crawl", json=data) # OR localhost if you run it locally
|
|
||||||
response_data = response.json()
|
# Example of executing a JS script on the page before extracting the content
|
||||||
print(response_data['results'][0].keys())
|
# data = {
|
||||||
|
# "urls": [
|
||||||
|
# "https://www.nbcnews.com/business"
|
||||||
|
# ],
|
||||||
|
# "screenshot": True,
|
||||||
|
# 'js' : ["""
|
||||||
|
# const loadMoreButton = Array.from(document.querySelectorAll('button')).
|
||||||
|
# find(button => button.textContent.includes('Load More'));
|
||||||
|
# loadMoreButton && loadMoreButton.click();
|
||||||
|
# """]
|
||||||
|
# }
|
||||||
|
|
||||||
|
# Example of using a custom extraction strategy
|
||||||
|
# data = {
|
||||||
|
# "urls": [
|
||||||
|
# "https://www.nbcnews.com/business"
|
||||||
|
# ],
|
||||||
|
# "extraction_strategy": "CosineStrategy",
|
||||||
|
# "extraction_strategy_args": {
|
||||||
|
# "semantic_filter": "inflation rent prices"
|
||||||
|
# },
|
||||||
|
# }
|
||||||
|
|
||||||
|
# Example of using LLM to extract content
|
||||||
|
# data = {
|
||||||
|
# "urls": [
|
||||||
|
# "https://www.nbcnews.com/business"
|
||||||
|
# ],
|
||||||
|
# "extraction_strategy": "LLMExtractionStrategy",
|
||||||
|
# "extraction_strategy_args": {
|
||||||
|
# "provider": "groq/llama3-8b-8192",
|
||||||
|
# "api_token": os.environ.get("GROQ_API_KEY"),
|
||||||
|
# "instruction": """I am interested in only financial news,
|
||||||
|
# and translate them in French."""
|
||||||
|
# },
|
||||||
|
# }
|
||||||
|
|
||||||
|
response = requests.post("https://crawl4ai.com/crawl", json=data)
|
||||||
|
result = response.json()['results'][0]
|
||||||
|
|
||||||
|
print(result['markdown'])
|
||||||
|
print(result['cleaned_html'])
|
||||||
|
print(result['media'])
|
||||||
|
print(result['extracted_content'])
|
||||||
|
with open("screenshot.png", "wb") as f:
|
||||||
|
f.write(base64.b64decode(result['screenshot']))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -157,9 +157,8 @@ with open("screenshot.png", "wb") as f:
|
|||||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||||
loadMoreButton && loadMoreButton.click();
|
loadMoreButton && loadMoreButton.click();
|
||||||
"""]
|
"""]
|
||||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
crawler = WebCrawler(verbos=crawler_strategy, always_by_pass_cache=True)
|
||||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
|
||||||
result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
|
|
||||||
<div class="">Remember that you can pass multiple JavaScript code snippets in the list. They will all be executed in the order they are passed.</div>
|
<div class="">Remember that you can pass multiple JavaScript code snippets in the list. They will all be executed in the order they are passed.</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user