- User agent
- Extract Links - Extract Metadata - Update Readme - Update REST API document
This commit is contained in:
@@ -1 +1,4 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## TODO:
|
||||||
|
- User agent: "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",
|
||||||
31
README.md
31
README.md
@@ -14,6 +14,9 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
|||||||
|
|
||||||
### v0.2.3
|
### v0.2.3
|
||||||
- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
|
- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
|
||||||
|
- 🔗 Extract all external and internal links. Check `result.links`
|
||||||
|
- 📚 Extract metadata from the page. Check `result.metadata`
|
||||||
|
- 🕵️ Support `user_agent` parameter to set the user agent for the HTTP requests.
|
||||||
- 🖼️ Take [screenshots](#taking-screenshots) of the page.
|
- 🖼️ Take [screenshots](#taking-screenshots) of the page.
|
||||||
|
|
||||||
### v0.2.2
|
### v0.2.2
|
||||||
@@ -32,7 +35,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
|||||||
|
|
||||||
## Power and Simplicity of Crawl4AI 🚀
|
## Power and Simplicity of Crawl4AI 🚀
|
||||||
|
|
||||||
The most easy way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server. I may improve its capacity if I see there is demand.
|
The most easy way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server. I may improve its capacity if I see there is demand. You can find all examples of the REST API in this Colab notebook: [Open in Colab](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import requests
|
import requests
|
||||||
@@ -41,7 +44,6 @@ data = {
|
|||||||
"urls": [
|
"urls": [
|
||||||
"https://www.nbcnews.com/business"
|
"https://www.nbcnews.com/business"
|
||||||
],
|
],
|
||||||
"word_count_threshold": 5,
|
|
||||||
"screenshot": True
|
"screenshot": True
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -242,8 +244,12 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t
|
|||||||
"url": "https://www.nbcnews.com/business",
|
"url": "https://www.nbcnews.com/business",
|
||||||
"extracted_content": "...",
|
"extracted_content": "...",
|
||||||
"html": "...",
|
"html": "...",
|
||||||
|
"cleaned_html": "...",
|
||||||
"markdown": "...",
|
"markdown": "...",
|
||||||
"metadata": {...}
|
"media": {...},
|
||||||
|
"links": {...},
|
||||||
|
"metadata": {...},
|
||||||
|
"screenshot": "..."
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -282,6 +288,24 @@ Crawl result without raw HTML content:
|
|||||||
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Result Structure
|
||||||
|
|
||||||
|
The result object contains the following fields:
|
||||||
|
```python
|
||||||
|
class CrawlResult(BaseModel):
|
||||||
|
url: str
|
||||||
|
html: str
|
||||||
|
success: bool
|
||||||
|
cleaned_html: Optional[str] = None
|
||||||
|
media: Dict[str, List[Dict]] = {} # Media tags in the page {"images": [], "audio": [], "video": []}
|
||||||
|
links: Dict[str, List[Dict]] = {} # Links in the page {"external": [], "internal": []}
|
||||||
|
screenshot: Optional[str] = None # Base64 encoded screenshot
|
||||||
|
markdown: Optional[str] = None
|
||||||
|
extracted_content: Optional[str] = None
|
||||||
|
metadata: Optional[dict] = None
|
||||||
|
error_message: Optional[str] = None
|
||||||
|
```
|
||||||
|
|
||||||
### Taking Screenshots
|
### Taking Screenshots
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -401,6 +425,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
|
|||||||
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
|
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
|
||||||
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
|
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
|
||||||
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
|
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
|
||||||
|
| `user_agent` | The user agent to use for the HTTP requests. | No | `Mozilla/5.0` |
|
||||||
| `verbose` | Whether to enable verbose logging. | No | `true` |
|
| `verbose` | Whether to enable verbose logging. | No | `true` |
|
||||||
|
|
||||||
## Chunking Strategies 📚
|
## Chunking Strategies 📚
|
||||||
|
|||||||
@@ -45,6 +45,10 @@ class CrawlerStrategy(ABC):
|
|||||||
def take_screenshot(self, save_path: str):
|
def take_screenshot(self, save_path: str):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def update_user_agent(self, user_agent: str):
|
||||||
|
pass
|
||||||
|
|
||||||
class CloudCrawlerStrategy(CrawlerStrategy):
|
class CloudCrawlerStrategy(CrawlerStrategy):
|
||||||
def __init__(self, use_cached_html = False):
|
def __init__(self, use_cached_html = False):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@@ -69,6 +73,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
||||||
self.options = Options()
|
self.options = Options()
|
||||||
self.options.headless = True
|
self.options.headless = True
|
||||||
|
if kwargs.get("user_agent"):
|
||||||
|
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
|
||||||
self.options.add_argument("--no-sandbox")
|
self.options.add_argument("--no-sandbox")
|
||||||
self.options.add_argument("--headless")
|
self.options.add_argument("--headless")
|
||||||
# self.options.add_argument("--disable-dev-shm-usage")
|
# self.options.add_argument("--disable-dev-shm-usage")
|
||||||
@@ -97,6 +103,11 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
self.service.log_path = "NUL"
|
self.service.log_path = "NUL"
|
||||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||||
|
|
||||||
|
def update_user_agent(self, user_agent: str):
|
||||||
|
self.options.add_argument(f"user-agent={user_agent}")
|
||||||
|
self.driver.quit()
|
||||||
|
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||||
|
|
||||||
def crawl(self, url: str) -> str:
|
def crawl(self, url: str) -> str:
|
||||||
if self.use_cached_html:
|
if self.use_cached_html:
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ def init_db():
|
|||||||
success BOOLEAN,
|
success BOOLEAN,
|
||||||
media TEXT DEFAULT "{}",
|
media TEXT DEFAULT "{}",
|
||||||
link TEXT DEFAULT "{}",
|
link TEXT DEFAULT "{}",
|
||||||
|
metadata TEXT DEFAULT "{}",
|
||||||
screenshot TEXT DEFAULT ""
|
screenshot TEXT DEFAULT ""
|
||||||
)
|
)
|
||||||
''')
|
''')
|
||||||
@@ -42,12 +43,12 @@ def check_db_path():
|
|||||||
if not DB_PATH:
|
if not DB_PATH:
|
||||||
raise ValueError("Database path is not set or is empty.")
|
raise ValueError("Database path is not set or is empty.")
|
||||||
|
|
||||||
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, bool, str]]:
|
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
|
||||||
check_db_path()
|
check_db_path()
|
||||||
try:
|
try:
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot FROM crawled_data WHERE url = ?', (url,))
|
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
|
||||||
result = cursor.fetchone()
|
result = cursor.fetchone()
|
||||||
conn.close()
|
conn.close()
|
||||||
return result
|
return result
|
||||||
@@ -55,14 +56,14 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, boo
|
|||||||
print(f"Error retrieving cached URL: {e}")
|
print(f"Error retrieving cached URL: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", screenshot: str = ""):
|
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
|
||||||
check_db_path()
|
check_db_path()
|
||||||
try:
|
try:
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot)
|
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
ON CONFLICT(url) DO UPDATE SET
|
ON CONFLICT(url) DO UPDATE SET
|
||||||
html = excluded.html,
|
html = excluded.html,
|
||||||
cleaned_html = excluded.cleaned_html,
|
cleaned_html = excluded.cleaned_html,
|
||||||
@@ -71,8 +72,9 @@ def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_c
|
|||||||
success = excluded.success,
|
success = excluded.success,
|
||||||
media = excluded.media,
|
media = excluded.media,
|
||||||
links = excluded.links,
|
links = excluded.links,
|
||||||
|
metadata = excluded.metadata,
|
||||||
screenshot = excluded.screenshot
|
screenshot = excluded.screenshot
|
||||||
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot))
|
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -126,5 +128,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
init_db() # Initialize the database if not already initialized
|
init_db() # Initialize the database if not already initialized
|
||||||
alter_db_add_screenshot("links") # Add the new column to the table
|
alter_db_add_screenshot("metadata") # Add the new column to the table
|
||||||
update_existing_records("links") # Update existing records to set the new column to an empty string
|
update_existing_records("metadata") # Update existing records to set the new column to an empty string
|
||||||
|
|||||||
@@ -359,6 +359,47 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
|
|||||||
print('Error processing HTML content:', str(e))
|
print('Error processing HTML content:', str(e))
|
||||||
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
|
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def extract_metadata(html):
|
||||||
|
metadata = {}
|
||||||
|
|
||||||
|
if not html:
|
||||||
|
return metadata
|
||||||
|
|
||||||
|
# Parse HTML content with BeautifulSoup
|
||||||
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
|
|
||||||
|
# Title
|
||||||
|
title_tag = soup.find('title')
|
||||||
|
metadata['title'] = title_tag.string if title_tag else None
|
||||||
|
|
||||||
|
# Meta description
|
||||||
|
description_tag = soup.find('meta', attrs={'name': 'description'})
|
||||||
|
metadata['description'] = description_tag['content'] if description_tag else None
|
||||||
|
|
||||||
|
# Meta keywords
|
||||||
|
keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
|
||||||
|
metadata['keywords'] = keywords_tag['content'] if keywords_tag else None
|
||||||
|
|
||||||
|
# Meta author
|
||||||
|
author_tag = soup.find('meta', attrs={'name': 'author'})
|
||||||
|
metadata['author'] = author_tag['content'] if author_tag else None
|
||||||
|
|
||||||
|
# Open Graph metadata
|
||||||
|
og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
|
||||||
|
for tag in og_tags:
|
||||||
|
property_name = tag['property']
|
||||||
|
metadata[property_name] = tag['content']
|
||||||
|
|
||||||
|
# Twitter Card metadata
|
||||||
|
twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
|
||||||
|
for tag in twitter_tags:
|
||||||
|
property_name = tag['name']
|
||||||
|
metadata[property_name] = tag['content']
|
||||||
|
|
||||||
|
return metadata
|
||||||
|
|
||||||
def extract_xml_tags(string):
|
def extract_xml_tags(string):
|
||||||
tags = re.findall(r'<(\w+)>', string)
|
tags = re.findall(r'<(\w+)>', string)
|
||||||
return list(set(tags))
|
return list(set(tags))
|
||||||
|
|||||||
@@ -89,8 +89,11 @@ class WebCrawler:
|
|||||||
css_selector: str = None,
|
css_selector: str = None,
|
||||||
screenshot: bool = False,
|
screenshot: bool = False,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
|
user_agent: str = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> CrawlResult:
|
) -> CrawlResult:
|
||||||
|
if user_agent:
|
||||||
|
self.crawler_strategy.update_user_agent(user_agent)
|
||||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||||
extraction_strategy.verbose = verbose
|
extraction_strategy.verbose = verbose
|
||||||
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
|
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
|
||||||
@@ -117,7 +120,8 @@ class WebCrawler:
|
|||||||
"success": cached[5],
|
"success": cached[5],
|
||||||
"media": json.loads(cached[6] or "{}"),
|
"media": json.loads(cached[6] or "{}"),
|
||||||
"links": json.loads(cached[7] or "{}"),
|
"links": json.loads(cached[7] or "{}"),
|
||||||
"screenshot": cached[8],
|
"metadata": json.loads(cached[8] or "{}"), # "metadata": "{}
|
||||||
|
"screenshot": cached[9],
|
||||||
"error_message": "",
|
"error_message": "",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -135,6 +139,7 @@ class WebCrawler:
|
|||||||
# Extract content from HTML
|
# Extract content from HTML
|
||||||
try:
|
try:
|
||||||
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
|
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
|
||||||
|
metadata = extract_metadata(html)
|
||||||
if result is None:
|
if result is None:
|
||||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||||
except InvalidCSSSelectorError as e:
|
except InvalidCSSSelectorError as e:
|
||||||
@@ -180,6 +185,7 @@ class WebCrawler:
|
|||||||
success,
|
success,
|
||||||
json.dumps(media),
|
json.dumps(media),
|
||||||
json.dumps(links),
|
json.dumps(links),
|
||||||
|
json.dumps(metadata),
|
||||||
screenshot=base64_image,
|
screenshot=base64_image,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -190,6 +196,7 @@ class WebCrawler:
|
|||||||
markdown=markdown,
|
markdown=markdown,
|
||||||
media=media,
|
media=media,
|
||||||
links=links,
|
links=links,
|
||||||
|
metadata=metadata,
|
||||||
screenshot=base64_image,
|
screenshot=base64_image,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
success=success,
|
success=success,
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 344 KiB After Width: | Height: | Size: 372 KiB |
BIN
docs/examples/assets/css_js.png
Normal file
BIN
docs/examples/assets/css_js.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 537 KiB |
BIN
docs/examples/assets/semantic_extraction_cosine.png
Normal file
BIN
docs/examples/assets/semantic_extraction_cosine.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 419 KiB |
BIN
docs/examples/assets/semantic_extraction_llm.png
Normal file
BIN
docs/examples/assets/semantic_extraction_llm.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 485 KiB |
@@ -1,75 +1,64 @@
|
|||||||
|
|
||||||
import requests, base64, os
|
import requests, base64, os
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
|
"screenshot": True,
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("https://crawl4ai.com/crawl", json=data)
|
||||||
|
result = response.json()['results'][0]
|
||||||
|
print(result.keys())
|
||||||
|
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
|
||||||
|
# 'links', 'screenshot', 'markdown', 'extracted_content',
|
||||||
|
# 'metadata', 'error_message'])
|
||||||
|
with open("screenshot.png", "wb") as f:
|
||||||
|
f.write(base64.b64decode(result['screenshot']))
|
||||||
|
|
||||||
|
# Example of filtering the content using CSS selectors
|
||||||
|
data = {
|
||||||
|
"urls": [
|
||||||
|
"https://www.nbcnews.com/business"
|
||||||
|
],
|
||||||
|
"css_selector": "article",
|
||||||
|
"screenshot": True,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Example of executing a JS script on the page before extracting the content
|
||||||
data = {
|
data = {
|
||||||
"urls": [
|
"urls": [
|
||||||
"https://www.nbcnews.com/business"
|
"https://www.nbcnews.com/business"
|
||||||
],
|
],
|
||||||
"screenshot": True,
|
"screenshot": True,
|
||||||
|
'js' : ["""
|
||||||
|
const loadMoreButton = Array.from(document.querySelectorAll('button')).
|
||||||
|
find(button => button.textContent.includes('Load More'));
|
||||||
|
loadMoreButton && loadMoreButton.click();
|
||||||
|
"""]
|
||||||
}
|
}
|
||||||
|
|
||||||
# Example of filtering the content using CSS selectors
|
|
||||||
# data = {
|
|
||||||
# "urls": [
|
|
||||||
# "https://www.nbcnews.com/business"
|
|
||||||
# ],
|
|
||||||
# "css_selector": "article",
|
|
||||||
# "screenshot": True,
|
|
||||||
# }
|
|
||||||
|
|
||||||
# Example of executing a JS script on the page before extracting the content
|
|
||||||
# data = {
|
|
||||||
# "urls": [
|
|
||||||
# "https://www.nbcnews.com/business"
|
|
||||||
# ],
|
|
||||||
# "screenshot": True,
|
|
||||||
# 'js' : ["""
|
|
||||||
# const loadMoreButton = Array.from(document.querySelectorAll('button')).
|
|
||||||
# find(button => button.textContent.includes('Load More'));
|
|
||||||
# loadMoreButton && loadMoreButton.click();
|
|
||||||
# """]
|
|
||||||
# }
|
|
||||||
|
|
||||||
# Example of using a custom extraction strategy
|
# Example of using a custom extraction strategy
|
||||||
# data = {
|
data = {
|
||||||
# "urls": [
|
"urls": [
|
||||||
# "https://www.nbcnews.com/business"
|
"https://www.nbcnews.com/business"
|
||||||
# ],
|
],
|
||||||
# "extraction_strategy": "CosineStrategy",
|
"extraction_strategy": "CosineStrategy",
|
||||||
# "extraction_strategy_args": {
|
"extraction_strategy_args": {
|
||||||
# "semantic_filter": "inflation rent prices"
|
"semantic_filter": "inflation rent prices"
|
||||||
# },
|
},
|
||||||
# }
|
}
|
||||||
|
|
||||||
# Example of using LLM to extract content
|
# Example of using LLM to extract content
|
||||||
# data = {
|
data = {
|
||||||
# "urls": [
|
"urls": [
|
||||||
# "https://www.nbcnews.com/business"
|
"https://www.nbcnews.com/business"
|
||||||
# ],
|
],
|
||||||
# "extraction_strategy": "LLMExtractionStrategy",
|
"extraction_strategy": "LLMExtractionStrategy",
|
||||||
# "extraction_strategy_args": {
|
"extraction_strategy_args": {
|
||||||
# "provider": "groq/llama3-8b-8192",
|
"provider": "groq/llama3-8b-8192",
|
||||||
# "api_token": os.environ.get("GROQ_API_KEY"),
|
"api_token": os.environ.get("GROQ_API_KEY"),
|
||||||
# "instruction": """I am interested in only financial news,
|
"instruction": """I am interested in only financial news,
|
||||||
# and translate them in French."""
|
and translate them in French."""
|
||||||
# },
|
},
|
||||||
# }
|
}
|
||||||
|
|
||||||
response = requests.post("https://crawl4ai.com/crawl", json=data)
|
|
||||||
result = response.json()['results'][0]
|
|
||||||
|
|
||||||
print(result['markdown'])
|
|
||||||
print(result['cleaned_html'])
|
|
||||||
print(result['media'])
|
|
||||||
print(result['extracted_content'])
|
|
||||||
with open("screenshot.png", "wb") as f:
|
|
||||||
f.write(base64.b64decode(result['screenshot']))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
2
main.py
2
main.py
@@ -57,6 +57,7 @@ class CrawlRequest(BaseModel):
|
|||||||
chunking_strategy_args: Optional[dict] = {}
|
chunking_strategy_args: Optional[dict] = {}
|
||||||
css_selector: Optional[str] = None
|
css_selector: Optional[str] = None
|
||||||
screenshot: Optional[bool] = False
|
screenshot: Optional[bool] = False
|
||||||
|
user_agent: Optional[str] = None
|
||||||
verbose: Optional[bool] = True
|
verbose: Optional[bool] = True
|
||||||
|
|
||||||
|
|
||||||
@@ -127,6 +128,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
|||||||
crawl_request.bypass_cache,
|
crawl_request.bypass_cache,
|
||||||
crawl_request.css_selector,
|
crawl_request.css_selector,
|
||||||
crawl_request.screenshot,
|
crawl_request.screenshot,
|
||||||
|
crawl_request.user_agent,
|
||||||
crawl_request.verbose
|
crawl_request.verbose
|
||||||
)
|
)
|
||||||
for url in crawl_request.urls
|
for url in crawl_request.urls
|
||||||
|
|||||||
Reference in New Issue
Block a user