- User agent

- Extract Links
- Extract Metadata
- Update Readme
- Update REST API document
This commit is contained in:
unclecode
2024-06-08 17:59:42 +08:00
parent 9c34b30723
commit b3a0edaa6d
12 changed files with 155 additions and 75 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 344 KiB

After

Width:  |  Height:  |  Size: 372 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 537 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 419 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 485 KiB

View File

@@ -1,75 +1,64 @@
import requests, base64, os
# Basic example: request a crawl of a single page, including a screenshot.
data = {
    "urls": ["https://www.nbcnews.com/business"],
    "screenshot": True,
}

response = requests.post("https://crawl4ai.com/crawl", json=data)
# Surface HTTP failures here rather than as a confusing JSON/KeyError below.
response.raise_for_status()
result = response.json()['results'][0]

print(result.keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])

# The screenshot field is base64 text; decode it and write the raw bytes.
with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(result['screenshot']))
# Example payload: filter the content using a CSS selector.
data = {
    "urls": ["https://www.nbcnews.com/business"],
    "css_selector": "article",
    "screenshot": True,
}
# Example payload: run a JS snippet on the page before the content is extracted.
# The snippet looks for a button whose text includes 'Load More' and clicks it
# when one is found.
data = {
    "urls": ["https://www.nbcnews.com/business"],
    "screenshot": True,
    "js": [
        "\n"
        "const loadMoreButton = Array.from(document.querySelectorAll('button')).\n"
        "find(button => button.textContent.includes('Load More'));\n"
        "loadMoreButton && loadMoreButton.click();\n"
    ],
}
# Example of filtering the content using CSS selectors
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "css_selector": "article",
# "screenshot": True,
# }
# Example of executing a JS script on the page before extracting the content
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "screenshot": True,
# 'js' : ["""
# const loadMoreButton = Array.from(document.querySelectorAll('button')).
# find(button => button.textContent.includes('Load More'));
# loadMoreButton && loadMoreButton.click();
# """]
# }
# Example of using a custom extraction strategy
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "extraction_strategy": "CosineStrategy",
# "extraction_strategy_args": {
# "semantic_filter": "inflation rent prices"
# },
# }
# Example payload: select the "CosineStrategy" extraction strategy and pass it
# a semantic filter phrase through the strategy arguments.
data = {
    "urls": ["https://www.nbcnews.com/business"],
    "extraction_strategy": "CosineStrategy",
    "extraction_strategy_args": {"semantic_filter": "inflation rent prices"},
}
# Example of using LLM to extract content
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "extraction_strategy": "LLMExtractionStrategy",
# "extraction_strategy_args": {
# "provider": "groq/llama3-8b-8192",
# "api_token": os.environ.get("GROQ_API_KEY"),
# "instruction": """I am interested in only financial news,
# and translate them in French."""
# },
# }
# Send the most recently assigned `data` payload and show the main result fields.
response = requests.post("https://crawl4ai.com/crawl", json=data)
# Surface HTTP failures here rather than as a confusing JSON/KeyError below.
response.raise_for_status()
result = response.json()['results'][0]

print(result['markdown'])
print(result['cleaned_html'])
print(result['media'])
print(result['extracted_content'])

# Not every payload asks for a screenshot, so the field may be empty
# (presumably None when not requested -- confirm against the API). Skip the
# file write in that case instead of failing inside b64decode.
screenshot = result.get('screenshot')
if screenshot:
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(screenshot))
# Example payload: select the "LLMExtractionStrategy" extraction strategy.
# The API token is read from the GROQ_API_KEY environment variable and is
# None when that variable is not set.
data = {
    "urls": ["https://www.nbcnews.com/business"],
    "extraction_strategy": "LLMExtractionStrategy",
    "extraction_strategy_args": {
        "provider": "groq/llama3-8b-8192",
        "api_token": os.environ.get("GROQ_API_KEY"),
        "instruction": (
            "I am interested in only financial news,\n"
            "and translate them in French."
        ),
    },
}