perf(crawler): major performance improvements & raw HTML support

- Switch to lxml parser (~4x speedup)
- Add raw HTML & local file crawling support
- Fix cache headers & async cleanup
- Add browser process monitoring
- Optimize BeautifulSoup operations
- Pre-compile regex patterns

Breaking: Raw HTML handling requires new URL prefixes
Fixes: #256, #253
This commit is contained in:
UncleCode
2024-11-13 19:40:40 +08:00
parent 61b93ebf36
commit c38ac29edb
11 changed files with 2953 additions and 130 deletions

View File

@@ -84,7 +84,7 @@ class ManagedBrowser:
print(f"STDOUT: {stdout.decode()}")
print(f"STDERR: {stderr.decode()}")
await self.cleanup()
def _get_browser_path(self) -> str:
"""Returns the browser executable path based on OS and browser type"""
if sys.platform == "darwin": # macOS
@@ -493,6 +493,75 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
return page
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
    """
    Dispatch on the URL prefix: crawl a web page, read a local file, or accept inline HTML.

    Args:
        url (str): The source to process. Supported prefixes:
            - 'http://' or 'https://': handed to ``_crawl_web`` together with ``kwargs``.
            - 'file://': the remainder is opened as a UTF-8 text file.
            - 'raw:': the remainder is used verbatim as the HTML.
        **kwargs: Additional parameters. Only 'screenshot' (bool) is read here;
            for web URLs everything is forwarded unchanged to ``_crawl_web``.

    Returns:
        AsyncCrawlResponse: For web URLs, whatever ``_crawl_web`` returns. For
        local/raw input, a response carrying the HTML, empty headers, status
        code 200 and the screenshot (or None when none was requested).

    Raises:
        FileNotFoundError: A 'file://' path does not exist.
        ValueError: The URL carries none of the supported prefixes.
    """
    # Web URLs take the regular crawling path.
    if url.startswith(('http://', 'https://')):
        return await self._crawl_web(url, **kwargs)

    is_local_file = url.startswith('file://')
    if not is_local_file and not url.startswith('raw:'):
        raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'")

    if is_local_file:
        # Everything after the 'file://' prefix is the filesystem path.
        path = url[len('file://'):]
        if not os.path.exists(path):
            raise FileNotFoundError(f"Local file not found: {path}")
        with open(path, 'r', encoding='utf-8') as handle:
            html = handle.read()
    else:
        # Everything after the 'raw:' prefix is the HTML itself.
        html = url[len('raw:'):]

    screenshot_data = None
    if kwargs.get('screenshot', False):
        screenshot_data = await self._generate_screenshot_from_html(html)

    # Local and raw sources have no HTTP exchange: empty headers, status 200.
    return AsyncCrawlResponse(
        html=html,
        response_headers={},
        status_code=200,
        screenshot=screenshot_data,
        get_delayed_content=None,
    )
async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse:
"""
Existing web crawling logic remains unchanged.
Args:
url (str): The web URL to crawl.
**kwargs: Additional parameters.
Returns:
AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
"""
response_headers = {}
status_code = None
@@ -792,7 +861,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if self.verbose:
print(f"[LOG] ✅ Crawled {url} successfully!")
if self.use_cached_html:
cache_file_path = os.path.join(
Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
@@ -972,6 +1041,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
print(f"Warning: Failed to remove overlay elements: {str(e)}")
async def take_screenshot(self, page: Page) -> str:
"""
Takes a screenshot of the current page.
Args:
page (Page): The Playwright page instance
Returns:
str: Base64-encoded screenshot image
"""
try:
# The page is already loaded, just take the screenshot
screenshot = await page.screenshot(full_page=True)
@@ -991,4 +1069,36 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
return base64.b64encode(buffered.getvalue()).decode('utf-8')
finally:
await page.close()
async def _generate_screenshot_from_html(self, html: str) -> Optional[str]:
"""
Generates a screenshot from raw HTML content.
Args:
html (str): The HTML content to render and capture.
Returns:
Optional[str]: Base64-encoded screenshot image or an error image if failed.
"""
try:
if not self.browser:
await self.start()
page = await self.browser.new_page()
await page.set_content(html, wait_until='networkidle')
screenshot = await page.screenshot(full_page=True)
await page.close()
return base64.b64encode(screenshot).decode('utf-8')
except Exception as e:
error_message = f"Failed to take screenshot: {str(e)}"
print(error_message)
# Generate an error image
img = Image.new('RGB', (800, 600), color='black')
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
buffered = BytesIO()
img.save(buffered, format="JPEG")
return base64.b64encode(buffered.getvalue()).decode('utf-8')

View File

@@ -5,6 +5,7 @@ import asyncio
from typing import Optional, Tuple, Dict
from contextlib import asynccontextmanager
import logging
import json # Added for serialization/deserialization
# Set up logging
logging.basicConfig(level=logging.INFO)
@@ -89,7 +90,8 @@ class AsyncDatabaseManager:
media TEXT DEFAULT "{}",
links TEXT DEFAULT "{}",
metadata TEXT DEFAULT "{}",
screenshot TEXT DEFAULT ""
screenshot TEXT DEFAULT "",
response_headers TEXT DEFAULT "{}" -- New column added
)
''')
@@ -105,26 +107,51 @@ class AsyncDatabaseManager:
column_names = await self.execute_with_retry(_check_columns)
for column in ['media', 'links', 'metadata', 'screenshot']:
# List of new columns to add
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers']
for column in new_columns:
if column not in column_names:
await self.aalter_db_add_column(column)
async def aalter_db_add_column(self, new_column: str):
"""Add new column to the database"""
async def _alter(db):
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
if new_column == 'response_headers':
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
else:
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
logger.info(f"Added column '{new_column}' to the database.")
await self.execute_with_retry(_alter)
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]:
"""Retrieve cached URL data"""
async def _get(db):
async with db.execute(
'SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?',
'''
SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers
FROM crawled_data WHERE url = ?
''',
(url,)
) as cursor:
return await cursor.fetchone()
row = await cursor.fetchone()
if row:
# Deserialize JSON fields
return (
row[0], # url
row[1], # html
row[2], # cleaned_html
row[3], # markdown
row[4], # extracted_content
row[5], # success
json.loads(row[6] or '{}'), # media
json.loads(row[7] or '{}'), # links
json.loads(row[8] or '{}'), # metadata
row[9], # screenshot
json.loads(row[10] or '{}') # response_headers
)
return None
try:
return await self.execute_with_retry(_get)
@@ -132,12 +159,27 @@ class AsyncDatabaseManager:
logger.error(f"Error retrieving cached URL: {e}")
return None
async def acache_url(self, url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""):
async def acache_url(
self,
url: str,
html: str,
cleaned_html: str,
markdown: str,
extracted_content: str,
success: bool,
media: str = "{}",
links: str = "{}",
metadata: str = "{}",
screenshot: str = "",
response_headers: str = "{}" # New parameter added
):
"""Cache URL data with retry logic"""
async def _cache(db):
await db.execute('''
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT INTO crawled_data (
url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
html = excluded.html,
cleaned_html = excluded.cleaned_html,
@@ -147,8 +189,9 @@ class AsyncDatabaseManager:
media = excluded.media,
links = excluded.links,
metadata = excluded.metadata,
screenshot = excluded.screenshot
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
screenshot = excluded.screenshot,
response_headers = excluded.response_headers -- Update response_headers
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers))
try:
await self.execute_with_retry(_cache)
@@ -189,4 +232,4 @@ class AsyncDatabaseManager:
logger.error(f"Error flushing database: {e}")
# Create a singleton instance
async_db_manager = AsyncDatabaseManager()
async_db_manager = AsyncDatabaseManager()

View File

@@ -9,7 +9,7 @@ from .async_database import async_db_manager
from .chunking_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .content_scrapping_strategy import WebScrappingStrategy
from .content_scrapping_strategy import WebScrapingStrategy
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
from .utils import (
sanitize_input_encode,
@@ -47,17 +47,17 @@ class AsyncWebCrawler:
async def awarmup(self):
# Print a message for crawl4ai and its version
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
if self.verbose:
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
# await async_db_manager.ainit_db()
await async_db_manager.initialize()
# await self.arun(
# url="https://google.com/",
# word_count_threshold=5,
# bypass_cache=False,
# verbose=False,
# )
await self.arun(
url="https://google.com/",
word_count_threshold=5,
bypass_cache=False,
verbose=False,
)
self.ready = True
if self.verbose:
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
@@ -75,6 +75,19 @@ class AsyncWebCrawler:
verbose=True,
**kwargs,
) -> CrawlResult:
"""
Runs the crawler for a single source: URL (web, local file, or raw HTML).
Args:
url (str): The URL to crawl. Supported prefixes:
- 'http://' or 'https://': Web URL to crawl.
- 'file://': Local file path to process.
- 'raw:': Raw HTML content to process.
... [other existing parameters]
Returns:
CrawlResult: The result of the crawling and processing.
"""
try:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
@@ -89,8 +102,13 @@ class AsyncWebCrawler:
cached = None
screenshot_data = None
extracted_content = None
if not bypass_cache and not self.always_by_pass_cache:
is_web_url = url.startswith(('http://', 'https://'))
if is_web_url and not bypass_cache and not self.always_by_pass_cache:
cached = await async_db_manager.aget_cached_url(url)
# if not bypass_cache and not self.always_by_pass_cache:
# cached = await async_db_manager.aget_cached_url(url)
if kwargs.get("warmup", True) and not self.ready:
return None
@@ -117,25 +135,32 @@ class AsyncWebCrawler:
)
crawl_result = await self.aprocess_html(
url,
html,
extracted_content,
word_count_threshold,
extraction_strategy,
chunking_strategy,
css_selector,
screenshot_data,
verbose,
bool(cached),
url=url,
html=html,
extracted_content=extracted_content,
word_count_threshold=word_count_threshold,
extraction_strategy=extraction_strategy,
chunking_strategy=chunking_strategy,
css_selector=css_selector,
screenshot=screenshot_data,
verbose=verbose,
is_cached=bool(cached),
async_response=async_response,
bypass_cache=bypass_cache,
**kwargs,
)
crawl_result.status_code = async_response.status_code if async_response else 200
crawl_result.response_headers = async_response.response_headers if async_response else {}
if async_response:
crawl_result.status_code = async_response.status_code
crawl_result.response_headers = async_response.response_headers
else:
crawl_result.status_code = 200
crawl_result.response_headers = cached[10]
crawl_result.success = bool(html)
crawl_result.session_id = kwargs.get("session_id", None)
return crawl_result
except Exception as e:
if not hasattr(e, "msg"):
e.msg = str(e)
@@ -155,22 +180,40 @@ class AsyncWebCrawler:
verbose=True,
**kwargs,
) -> List[CrawlResult]:
tasks = [
self.arun(
url,
word_count_threshold,
extraction_strategy,
chunking_strategy,
bypass_cache,
css_selector,
screenshot,
user_agent,
verbose,
**kwargs
)
for url in urls
]
return await asyncio.gather(*tasks)
"""
Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
Args:
urls (List[str]): A list of URLs with supported prefixes:
- 'http://' or 'https://': Web URL to crawl.
- 'file://': Local file path to process.
- 'raw:': Raw HTML content to process.
... [other existing parameters]
Returns:
List[CrawlResult]: The results of the crawling and processing.
"""
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
semaphore = asyncio.Semaphore(semaphore_count)
async def crawl_with_semaphore(url):
async with semaphore:
return await self.arun(
url,
word_count_threshold=word_count_threshold,
extraction_strategy=extraction_strategy,
chunking_strategy=chunking_strategy,
bypass_cache=bypass_cache,
css_selector=css_selector,
screenshot=screenshot,
user_agent=user_agent,
verbose=verbose,
**kwargs,
)
tasks = [crawl_with_semaphore(url) for url in urls]
results = await asyncio.gather(*tasks, return_exceptions=True)
return [result if not isinstance(result, Exception) else str(result) for result in results]
async def aprocess_html(
self,
@@ -184,13 +227,14 @@ class AsyncWebCrawler:
screenshot: str,
verbose: bool,
is_cached: bool,
async_response: Optional[AsyncCrawlResponse],
**kwargs,
) -> CrawlResult:
t = time.time()
# Extract content from HTML
try:
t1 = time.time()
scrapping_strategy = WebScrappingStrategy()
scrapping_strategy = WebScrapingStrategy()
# result = await scrapping_strategy.ascrap(
result = scrapping_strategy.scrap(
url,
@@ -245,6 +289,12 @@ class AsyncWebCrawler:
)
screenshot = None if not screenshot else screenshot
response_headers = "{}" # Default value
if async_response:
# Serialize response_headers dict to JSON string
response_headers = json.dumps(async_response.response_headers, ensure_ascii=False)
if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
await async_db_manager.acache_url(
@@ -258,6 +308,7 @@ class AsyncWebCrawler:
json.dumps(links),
json.dumps(metadata),
screenshot=screenshot,
response_headers=response_headers,
)
return CrawlResult(

View File

@@ -15,7 +15,7 @@ class ContentCleaningStrategy:
self.link_density_threshold = 0.2
self.max_dom_depth = 10 # To prevent excessive DOM traversal
def clean(self, clean_html: str) -> str:
def clean(self, clean_html: str, soup = None) -> str:
"""
Main function that takes cleaned HTML and returns super cleaned HTML.
@@ -28,18 +28,20 @@ class ContentCleaningStrategy:
try:
if not clean_html or not isinstance(clean_html, str):
return ''
soup = BeautifulSoup(clean_html, 'html.parser')
if not soup:
# soup = BeautifulSoup(clean_html, 'html.parser')
soup = BeautifulSoup(clean_html, 'lxml')
main_content = self.extract_main_content(soup)
if main_content:
super_clean_element = self.clean_element(main_content)
return str(super_clean_element)
return super_clean_element.encode_contents().decode('utf-8')
else:
return ''
except Exception:
# Handle exceptions silently or log them as needed
return ''
def extract_main_content(self, soup: BeautifulSoup) -> Optional[Tag]:
def extract_main_content(self, soup) -> Optional[Tag]:
"""
Identifies and extracts the main content element from the HTML.

View File

@@ -1,3 +1,4 @@
import re # Point 1: Pre-Compile Regular Expressions
from abc import ABC, abstractmethod
from typing import Dict, Any
from bs4 import BeautifulSoup
@@ -105,7 +106,39 @@ class CustomHTML2Text(HTML2Text):
return
super().handle_data(data, entity_char)
class ContentScrappingStrategy(ABC):
# Pre-compile regular expressions for Open Graph and Twitter metadata
OG_REGEX = re.compile(r'^og:')
TWITTER_REGEX = re.compile(r'^twitter:')
# Integer part, an optional fractional part (discarded), then the unit text.
# Whitespace before the number and between number and unit is skipped.
DIMENSION_REGEX = re.compile(r"\s*(\d+)(?:\.\d+)?\s*(\D*)")


# Function to parse image height/width value and units
def parse_dimension(dimension):
    """
    Split a dimension value such as "120px" or "50%" into number and unit.

    Args:
        dimension: The raw value (typically a string). Non-string values are
            converted with ``str()`` before matching.

    Returns:
        tuple: ``(number, unit)`` where ``number`` is the leading integer part
        (a fractional part like ".5" is dropped) and ``unit`` is the trailing
        text with surrounding whitespace removed, defaulting to 'px' when no
        unit is present. ``(None, None)`` when the value is empty/falsy or
        does not start with a number.
    """
    if not dimension:
        return None, None
    match = DIMENSION_REGEX.match(str(dimension))
    if not match:
        return None, None
    number = int(match.group(1))
    # Default unit is 'px' if not specified
    unit = match.group(2).strip() or 'px'
    return number, unit
# Fetch image file metadata to extract size and extension
def fetch_image_file_size(img, base_url):
    """
    Ask the server for an image's size with a HEAD request.

    Args:
        img: Object exposing ``get('src')`` for the image source
            (presumably a BeautifulSoup tag; confirm against the caller).
        base_url: URL that a relative ``src`` is resolved against.

    Returns:
        The ``Content-Length`` header value of a 200 response (None if the
        header is absent), or None when the request fails or the status is
        not 200.
    """
    # If src is a relative path construct the full URL; if not it may be a CDN URL
    img_url = urljoin(base_url, img.get('src'))
    try:
        response = requests.head(img_url)
    except requests.RequestException:
        # Best effort: an unsupported scheme or a network failure simply means
        # the size is unknown. (The previous `finally: return` discarded every
        # result and every exception, so the function always returned None.)
        return None
    if response.status_code == 200:
        return response.headers.get('Content-Length', None)
    print(f"Failed to retrieve file size for {img_url}")
    return None
class ContentScrapingStrategy(ABC):
@abstractmethod
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
pass
@@ -114,7 +147,7 @@ class ContentScrappingStrategy(ABC):
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
pass
class WebScrappingStrategy(ContentScrappingStrategy):
class WebScrapingStrategy(ContentScrapingStrategy):
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs)
@@ -126,9 +159,16 @@ class WebScrappingStrategy(ContentScrappingStrategy):
if not html:
return None
soup = BeautifulSoup(html, 'html.parser')
# soup = BeautifulSoup(html, 'html.parser')
soup = BeautifulSoup(html, 'lxml')
body = soup.body
try:
meta = extract_metadata("", soup)
except Exception as e:
print('Error extracting metadata:', str(e))
meta = {}
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
@@ -187,31 +227,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
#Score an image for it's usefulness
def score_image_for_usefulness(img, base_url, index, images_count):
# Function to parse image height/width value and units
def parse_dimension(dimension):
if dimension:
match = re.match(r"(\d+)(\D*)", dimension)
if match:
number = int(match.group(1))
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
return number, unit
return None, None
# Fetch image file metadata to extract size and extension
def fetch_image_file_size(img, base_url):
#If src is relative path construct full URL, if not it may be CDN URL
img_url = urljoin(base_url,img.get('src'))
try:
response = requests.head(img_url)
if response.status_code == 200:
return response.headers.get('Content-Length',None)
else:
print(f"Failed to retrieve file size for {img_url}")
return None
except InvalidSchema as e:
return None
finally:
return
image_height = img.get('height')
height_value, height_unit = parse_dimension(image_height)
@@ -294,7 +310,6 @@ class WebScrappingStrategy(ContentScrappingStrategy):
exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
exclude_social_media_domains = list(set(exclude_social_media_domains))
try:
if element.name == 'a' and element.get('href'):
@@ -439,15 +454,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
except Exception as e:
print('Error processing element:', str(e))
return False
#process images by filtering and extracting contextual text from the page
# imgs = body.find_all('img')
# media['images'] = [
# result for result in
# (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
# if result is not None
# ]
process_element(body)
# Update the links dictionary with unique links
@@ -478,8 +485,9 @@ class WebScrappingStrategy(ContentScrappingStrategy):
# Replace base64 data with empty string
img['src'] = base64_pattern.sub('', src)
str_body = ""
try:
str(body)
str_body = body.encode_contents().decode('utf-8')
except Exception as e:
# Reset body to the original HTML
success = False
@@ -504,11 +512,12 @@ class WebScrappingStrategy(ContentScrappingStrategy):
# Append the error div to the body
body.body.append(error_div)
str_body = body.encode_contents().decode('utf-8')
print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
try:
h = CustomHTML2Text()
@@ -518,15 +527,14 @@ class WebScrappingStrategy(ContentScrappingStrategy):
markdown = h.handle(sanitize_html(cleaned_html))
markdown = markdown.replace(' ```', '```')
try:
meta = extract_metadata(html, soup)
except Exception as e:
print('Error extracting metadata:', str(e))
meta = {}
cleaner = ContentCleaningStrategy()
fit_html = cleaner.clean(cleaned_html)
fit_markdown = h.handle(fit_html)
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
if kwargs.get('fit_markdown', False):
cleaner = ContentCleaningStrategy()
fit_html = cleaner.clean(cleaned_html)
fit_markdown = h.handle(fit_html)
cleaned_html = sanitize_html(cleaned_html)
return {

View File

@@ -736,46 +736,54 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
'metadata': meta
}
def extract_metadata(html, soup = None):
def extract_metadata(html, soup=None):
metadata = {}
if not html:
if not html and not soup:
return {}
if not soup:
soup = BeautifulSoup(html, 'lxml')
head = soup.head
if not head:
return metadata
# Parse HTML content with BeautifulSoup
if not soup:
soup = BeautifulSoup(html, 'html.parser')
# Title
title_tag = soup.find('title')
metadata['title'] = title_tag.string if title_tag else None
title_tag = head.find('title')
metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None
# Meta description
description_tag = soup.find('meta', attrs={'name': 'description'})
metadata['description'] = description_tag['content'] if description_tag else None
description_tag = head.find('meta', attrs={'name': 'description'})
metadata['description'] = description_tag.get('content', '').strip() if description_tag else None
# Meta keywords
keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
metadata['keywords'] = keywords_tag['content'] if keywords_tag else None
keywords_tag = head.find('meta', attrs={'name': 'keywords'})
metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None
# Meta author
author_tag = soup.find('meta', attrs={'name': 'author'})
metadata['author'] = author_tag['content'] if author_tag else None
author_tag = head.find('meta', attrs={'name': 'author'})
metadata['author'] = author_tag.get('content', '').strip() if author_tag else None
# Open Graph metadata
og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')})
for tag in og_tags:
property_name = tag['property']
metadata[property_name] = tag['content']
property_name = tag.get('property', '').strip()
content = tag.get('content', '').strip()
if property_name and content:
metadata[property_name] = content
# Twitter Card metadata
twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')})
for tag in twitter_tags:
property_name = tag['name']
metadata[property_name] = tag['content']
property_name = tag.get('name', '').strip()
content = tag.get('content', '').strip()
if property_name and content:
metadata[property_name] = content
return metadata
def extract_xml_tags(string):
    """
    Collect the distinct names of attribute-free opening tags like ``<name>``.

    Closing tags and tags carrying attributes do not match the pattern.
    The returned list contains each name once, in no guaranteed order.
    """
    unique_names = {found.group(1) for found in re.finditer(r'<(\w+)>', string)}
    return list(unique_names)

View File

@@ -10,7 +10,7 @@ from .extraction_strategy import *
from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor
from .content_scrapping_strategy import WebScrappingStrategy
from .content_scrapping_strategy import WebScrapingStrategy
from .config import *
import warnings
import json
@@ -182,7 +182,7 @@ class WebCrawler:
# Extract content from HTML
try:
t1 = time.time()
scrapping_strategy = WebScrappingStrategy()
scrapping_strategy = WebScrapingStrategy()
extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
result = scrapping_strategy.scrap(
url,