Merge branch 'main' of https://github.com/unclecode/crawl4ai
This commit is contained in:
@@ -20,6 +20,7 @@ def init_db():
|
|||||||
extracted_content TEXT,
|
extracted_content TEXT,
|
||||||
success BOOLEAN,
|
success BOOLEAN,
|
||||||
media TEXT DEFAULT "{}",
|
media TEXT DEFAULT "{}",
|
||||||
|
link TEXT DEFAULT "{}",
|
||||||
screenshot TEXT DEFAULT ""
|
screenshot TEXT DEFAULT ""
|
||||||
)
|
)
|
||||||
''')
|
''')
|
||||||
@@ -41,12 +42,12 @@ def check_db_path():
|
|||||||
if not DB_PATH:
|
if not DB_PATH:
|
||||||
raise ValueError("Database path is not set or is empty.")
|
raise ValueError("Database path is not set or is empty.")
|
||||||
|
|
||||||
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
|
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, bool, str]]:
|
||||||
check_db_path()
|
check_db_path()
|
||||||
try:
|
try:
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, screenshot FROM crawled_data WHERE url = ?', (url,))
|
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot FROM crawled_data WHERE url = ?', (url,))
|
||||||
result = cursor.fetchone()
|
result = cursor.fetchone()
|
||||||
conn.close()
|
conn.close()
|
||||||
return result
|
return result
|
||||||
@@ -54,23 +55,24 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, st
|
|||||||
print(f"Error retrieving cached URL: {e}")
|
print(f"Error retrieving cached URL: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", screenshot: str = ""):
|
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", screenshot: str = ""):
|
||||||
check_db_path()
|
check_db_path()
|
||||||
try:
|
try:
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, screenshot)
|
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot)
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
ON CONFLICT(url) DO UPDATE SET
|
ON CONFLICT(url) DO UPDATE SET
|
||||||
html = excluded.html,
|
html = excluded.html,
|
||||||
cleaned_html = excluded.cleaned_html,
|
cleaned_html = excluded.cleaned_html,
|
||||||
markdown = excluded.markdown,
|
markdown = excluded.markdown,
|
||||||
extracted_content = excluded.extracted_content,
|
extracted_content = excluded.extracted_content,
|
||||||
success = excluded.success,
|
success = excluded.success,
|
||||||
media = excluded.media,
|
media = excluded.media,
|
||||||
|
links = excluded.links,
|
||||||
screenshot = excluded.screenshot
|
screenshot = excluded.screenshot
|
||||||
''', (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot))
|
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -124,5 +126,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
init_db() # Initialize the database if not already initialized
|
init_db() # Initialize the database if not already initialized
|
||||||
alter_db_add_screenshot() # Add the new column to the table
|
alter_db_add_screenshot("links") # Add the new column to the table
|
||||||
update_existing_records() # Update existing records to set the new column to an empty string
|
update_existing_records("links") # Update existing records to set the new column to its default value ("{}")
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ class CrawlResult(BaseModel):
|
|||||||
success: bool
|
success: bool
|
||||||
cleaned_html: Optional[str] = None
|
cleaned_html: Optional[str] = None
|
||||||
media: Dict[str, List[Dict]] = {}
|
media: Dict[str, List[Dict]] = {}
|
||||||
|
links: Dict[str, List[Dict]] = {}
|
||||||
screenshot: Optional[str] = None
|
screenshot: Optional[str] = None
|
||||||
markdown: Optional[str] = None
|
markdown: Optional[str] = None
|
||||||
extracted_content: Optional[str] = None
|
extracted_content: Optional[str] = None
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ class CustomHTML2Text(HTML2Text):
|
|||||||
|
|
||||||
super().handle_tag(tag, attrs, start)
|
super().handle_tag(tag, attrs, start)
|
||||||
|
|
||||||
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
|
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
|
||||||
try:
|
try:
|
||||||
if not html:
|
if not html:
|
||||||
return None
|
return None
|
||||||
@@ -170,6 +170,28 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
|
|||||||
for el in selected_elements:
|
for el in selected_elements:
|
||||||
div_tag.append(el)
|
div_tag.append(el)
|
||||||
body = div_tag
|
body = div_tag
|
||||||
|
|
||||||
|
links = {
|
||||||
|
'internal': [],
|
||||||
|
'external': []
|
||||||
|
}
|
||||||
|
|
||||||
|
# Extract all internal and external links
|
||||||
|
for a in body.find_all('a', href=True):
|
||||||
|
href = a['href']
|
||||||
|
url_base = url.split('/')[2]
|
||||||
|
if href.startswith('http') and url_base not in href:
|
||||||
|
links['external'].append({
|
||||||
|
'href': href,
|
||||||
|
'text': a.get_text()
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
links['internal'].append(
|
||||||
|
{
|
||||||
|
'href': href,
|
||||||
|
'text': a.get_text()
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Remove script, style, and other tags that don't carry useful content from body
|
# Remove script, style, and other tags that don't carry useful content from body
|
||||||
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
|
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
|
||||||
@@ -329,7 +351,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
|
|||||||
'markdown': markdown,
|
'markdown': markdown,
|
||||||
'cleaned_html': cleaned_html,
|
'cleaned_html': cleaned_html,
|
||||||
'success': True,
|
'success': True,
|
||||||
'media': media
|
'media': media,
|
||||||
|
'links': links
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -116,7 +116,8 @@ class WebCrawler:
|
|||||||
"extracted_content": cached[4],
|
"extracted_content": cached[4],
|
||||||
"success": cached[5],
|
"success": cached[5],
|
||||||
"media": json.loads(cached[6] or "{}"),
|
"media": json.loads(cached[6] or "{}"),
|
||||||
"screenshot": cached[7],
|
"links": json.loads(cached[7] or "{}"),
|
||||||
|
"screenshot": cached[8],
|
||||||
"error_message": "",
|
"error_message": "",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -133,15 +134,16 @@ class WebCrawler:
|
|||||||
error_message = ""
|
error_message = ""
|
||||||
# Extract content from HTML
|
# Extract content from HTML
|
||||||
try:
|
try:
|
||||||
result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
|
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
|
||||||
if result is None:
|
if result is None:
|
||||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||||
except InvalidCSSSelectorError as e:
|
except InvalidCSSSelectorError as e:
|
||||||
raise ValueError(str(e))
|
raise ValueError(str(e))
|
||||||
|
|
||||||
cleaned_html = result.get("cleaned_html", html)
|
cleaned_html = result.get("cleaned_html", "")
|
||||||
markdown = result.get("markdown", "")
|
markdown = result.get("markdown", "")
|
||||||
media = result.get("media", [])
|
media = result.get("media", [])
|
||||||
|
links = result.get("links", [])
|
||||||
|
|
||||||
# Print a professional LOG style message, show time taken and say crawling is done
|
# Print a professional LOG style message, show time taken and say crawling is done
|
||||||
if verbose:
|
if verbose:
|
||||||
@@ -177,6 +179,7 @@ class WebCrawler:
|
|||||||
extracted_content,
|
extracted_content,
|
||||||
success,
|
success,
|
||||||
json.dumps(media),
|
json.dumps(media),
|
||||||
|
json.dumps(links),
|
||||||
screenshot=base64_image,
|
screenshot=base64_image,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -186,6 +189,7 @@ class WebCrawler:
|
|||||||
cleaned_html=cleaned_html,
|
cleaned_html=cleaned_html,
|
||||||
markdown=markdown,
|
markdown=markdown,
|
||||||
media=media,
|
media=media,
|
||||||
|
links=links,
|
||||||
screenshot=base64_image,
|
screenshot=base64_image,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
success=success,
|
success=success,
|
||||||
@@ -229,3 +233,102 @@ class WebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def run_less_db(
    self,
    url: str,
    word_count_threshold=MIN_WORD_THRESHOLD,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    bypass_cache: bool = False,
    css_selector: str = None,
    screenshot: bool = False,
    verbose=True,
    **kwargs,
) -> CrawlResult:
    """Fetch ``url`` (or reuse its cached HTML) and delegate to ``process_html``.

    Args:
        url: Address of the page to crawl.
        word_count_threshold: Forwarded to ``process_html``; values below
            ``MIN_WORD_THRESHOLD`` are raised to that minimum.
        extraction_strategy: Strategy used to extract content; falls back to
            ``NoExtractionStrategy()`` when omitted.
        chunking_strategy: Strategy used to split the markdown into sections.
        bypass_cache: When true, skip the cache lookup and crawl the page.
        css_selector: Optional selector forwarded to ``process_html``.
        screenshot: Whether ``process_html`` should take a screenshot.
        verbose: Enables log output; also copied onto the extraction strategy.
        **kwargs: Forwarded unchanged to ``process_html``.

    Returns:
        The ``CrawlResult`` produced by ``process_html``.

    Raises:
        ValueError: If either strategy is not an instance of its base class.
    """
    extraction_strategy = extraction_strategy or NoExtractionStrategy()
    # Validate before touching the object so an unsupported strategy is
    # reported as such instead of being mutated (or failing on setattr).
    if not isinstance(extraction_strategy, ExtractionStrategy):
        raise ValueError("Unsupported extraction strategy")
    if not isinstance(chunking_strategy, ChunkingStrategy):
        raise ValueError("Unsupported chunking strategy")
    extraction_strategy.verbose = verbose

    word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)

    # Check cache first
    cached = None
    extracted_content = None
    if not bypass_cache and not self.always_by_pass_cache:
        cached = get_cached_url(url)

    if cached:
        # Row layout follows the SELECT in get_cached_url:
        # (url, html, cleaned_html, markdown, extracted_content, ...).
        html = cached[1]
        extracted_content = cached[4]
    else:
        html = self.crawler_strategy.crawl(url)
        # No cache write here: cache_url() requires the processed fields as
        # well, and a row holding only raw HTML would be returned by run()
        # as if it were a complete result. process_html() calls cache_url()
        # after extraction because extracted_content is None on this path.

    return self.process_html(
        url,
        html,
        extracted_content,
        word_count_threshold,
        extraction_strategy,
        chunking_strategy,
        css_selector,
        screenshot,
        verbose,
        **kwargs,
    )
|
||||||
|
|
||||||
|
def process_html(
    self,
    url: str,
    html: str,
    extracted_content: str,
    word_count_threshold: int,
    extraction_strategy: ExtractionStrategy,
    chunking_strategy: ChunkingStrategy,
    css_selector: str,
    screenshot: bool,
    verbose: bool,
    **kwargs,
) -> CrawlResult:
    """Turn raw HTML into a ``CrawlResult`` and cache newly extracted content.

    Args:
        url: Address the HTML was fetched from.
        html: Raw page HTML.
        extracted_content: Previously extracted content, or ``None`` to run
            ``extraction_strategy`` and cache its output.
        word_count_threshold: Forwarded to ``get_content_of_website``.
        extraction_strategy: Strategy run over the chunked markdown.
        chunking_strategy: Strategy that splits the markdown into sections.
        css_selector: Optional selector forwarded to ``get_content_of_website``.
        screenshot: When true, ask the crawler strategy for a screenshot.
        verbose: Enables progress log lines on stdout.
        **kwargs: Accepted for call compatibility; not used here.

    Returns:
        A ``CrawlResult`` with ``success=True`` and an empty error message.

    Raises:
        ValueError: If the CSS selector is invalid or no content could be
            extracted from ``html``.
    """
    started_at = time.time()
    base64_image = self.crawler_strategy.take_screenshot() if screenshot else None

    # Extract content from HTML
    try:
        result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
    except InvalidCSSSelectorError as e:
        # Keep the original exception chained for easier debugging.
        raise ValueError(str(e)) from e
    if result is None:
        raise ValueError(f"Failed to extract content from the website: {url}")

    cleaned_html = result.get("cleaned_html", "")
    markdown = result.get("markdown", "")
    # CrawlResult declares media/links as dicts, so fall back to dicts.
    media = result.get("media", {})
    links = result.get("links", {})

    if verbose:
        print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - started_at} seconds")
        print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")

    sections = chunking_strategy.chunk(markdown)

    if extracted_content is None:
        extracted_content = json.dumps(extraction_strategy.run(url, sections))
        # Cache the complete record. cache_url() takes
        # (url, html, cleaned_html, markdown, extracted_content, success,
        #  media, links, screenshot); media and links are stored as JSON text.
        cache_url(
            url,
            html,
            cleaned_html,
            markdown,
            extracted_content,
            True,
            json.dumps(media),
            json.dumps(links),
            screenshot=base64_image,
        )

    if verbose:
        print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - started_at} seconds.")

    return CrawlResult(
        url=url,
        html=html,
        cleaned_html=cleaned_html,
        markdown=markdown,
        media=media,
        links=links,
        screenshot=base64_image,
        extracted_content=extracted_content,
        success=True,
        error_message="",
    )
|
||||||
BIN
docs/examples/assets/css_selector.png
Normal file
BIN
docs/examples/assets/css_selector.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 375 KiB |
@@ -8,6 +8,14 @@ data = {
|
|||||||
"screenshot": True,
|
"screenshot": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Example of filtering the content using CSS selectors
|
||||||
|
# data = {
|
||||||
|
# "urls": [
|
||||||
|
# "https://www.nbcnews.com/business"
|
||||||
|
# ],
|
||||||
|
# "css_selector": "article",
|
||||||
|
# "screenshot": True,
|
||||||
|
# }
|
||||||
|
|
||||||
# Example of executing a JS script on the page before extracting the content
|
# Example of executing a JS script on the page before extracting the content
|
||||||
# data = {
|
# data = {
|
||||||
|
|||||||
Reference in New Issue
Block a user