perf(crawler): major performance improvements & raw HTML support
- Switch to lxml parser (~4x speedup)
- Add raw HTML & local file crawling support
- Fix cache headers & async cleanup
- Add browser process monitoring
- Optimize BeautifulSoup operations
- Pre-compile regex patterns

Breaking: Raw HTML handling requires new URL prefixes ('file://' for local files, 'raw:' for inline HTML)

Fixes: #256, #253
This commit is contained in:
@@ -84,7 +84,7 @@ class ManagedBrowser:
|
||||
print(f"STDOUT: {stdout.decode()}")
|
||||
print(f"STDERR: {stderr.decode()}")
|
||||
await self.cleanup()
|
||||
|
||||
|
||||
def _get_browser_path(self) -> str:
|
||||
"""Returns the browser executable path based on OS and browser type"""
|
||||
if sys.platform == "darwin": # macOS
|
||||
@@ -493,6 +493,75 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
return page
|
||||
|
||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
    """
    Dispatch a crawl request according to the prefix of ``url``.

    Args:
        url (str): The source to process. Supported prefixes:
            - 'http://' or 'https://': delegated to ``self._crawl_web``.
            - 'file://': the remainder is treated as a local path and read as UTF-8.
            - 'raw:': the remainder is used verbatim as the HTML.
        **kwargs: Additional parameters:
            - 'screenshot' (bool): Whether to render a screenshot of the HTML.
            - any other keys are forwarded unchanged to ``self._crawl_web``.

    Returns:
        AsyncCrawlResponse: For local/raw input, a response with the HTML, empty
        headers, status code 200 and the optional screenshot.

    Raises:
        FileNotFoundError: If a 'file://' path does not exist.
        ValueError: If ``url`` uses none of the supported prefixes.
    """
    # Web URLs keep going through the regular browser-based pipeline.
    if url.startswith(('http://', 'https://')):
        return await self._crawl_web(url, **kwargs)

    if url.startswith('file://'):
        local_file_path = url[len('file://'):]
        if not os.path.exists(local_file_path):
            raise FileNotFoundError(f"Local file not found: {local_file_path}")
        with open(local_file_path, 'r', encoding='utf-8') as f:
            html = f.read()
    elif url.startswith('raw:'):
        html = url[len('raw:'):]
    else:
        raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'")

    # Local and raw sources share the same response shape: no headers, status 200.
    screenshot_data = None
    if kwargs.get('screenshot', False):
        screenshot_data = await self._generate_screenshot_from_html(html)

    return AsyncCrawlResponse(
        html=html,
        response_headers={},
        status_code=200,
        screenshot=screenshot_data,
        get_delayed_content=None
    )
|
||||
|
||||
|
||||
async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||
"""
|
||||
Existing web crawling logic remains unchanged.
|
||||
|
||||
Args:
|
||||
url (str): The web URL to crawl.
|
||||
**kwargs: Additional parameters.
|
||||
|
||||
Returns:
|
||||
AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
|
||||
"""
|
||||
response_headers = {}
|
||||
status_code = None
|
||||
|
||||
@@ -792,7 +861,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||
|
||||
|
||||
if self.use_cached_html:
|
||||
cache_file_path = os.path.join(
|
||||
Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
|
||||
@@ -972,6 +1041,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
print(f"Warning: Failed to remove overlay elements: {str(e)}")
|
||||
|
||||
async def take_screenshot(self, page: Page) -> str:
|
||||
"""
|
||||
Takes a screenshot of the current page.
|
||||
|
||||
Args:
|
||||
page (Page): The Playwright page instance
|
||||
|
||||
Returns:
|
||||
str: Base64-encoded screenshot image
|
||||
"""
|
||||
try:
|
||||
# The page is already loaded, just take the screenshot
|
||||
screenshot = await page.screenshot(full_page=True)
|
||||
@@ -991,4 +1069,36 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
async def _generate_screenshot_from_html(self, html: str) -> Optional[str]:
    """
    Generates a screenshot from raw HTML content.

    Starts the browser via ``self.start()`` when ``self.browser`` is falsy, opens a
    new page, loads ``html`` with ``wait_until='networkidle'`` and captures a
    full-page screenshot.

    Args:
        html (str): The HTML content to render and capture.

    Returns:
        Optional[str]: Base64-encoded screenshot image. If anything fails, the error
        is printed and a base64-encoded JPEG of a black 800x600 image carrying the
        error message is returned instead.
    """
    try:
        if not self.browser:
            await self.start()
        page = await self.browser.new_page()
        try:
            await page.set_content(html, wait_until='networkidle')
            screenshot = await page.screenshot(full_page=True)
        finally:
            # Close the page even when rendering fails; otherwise every failed
            # screenshot leaves an open page behind in the browser.
            await page.close()
        return base64.b64encode(screenshot).decode('utf-8')
    except Exception as e:
        error_message = f"Failed to take screenshot: {str(e)}"
        print(error_message)

        # Generate an error image so callers still receive image data.
        img = Image.new('RGB', (800, 600), color='black')
        draw = ImageDraw.Draw(img)
        font = ImageFont.load_default()
        draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)

        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import asyncio
|
||||
from typing import Optional, Tuple, Dict
|
||||
from contextlib import asynccontextmanager
|
||||
import logging
|
||||
import json # Added for serialization/deserialization
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -89,7 +90,8 @@ class AsyncDatabaseManager:
|
||||
media TEXT DEFAULT "{}",
|
||||
links TEXT DEFAULT "{}",
|
||||
metadata TEXT DEFAULT "{}",
|
||||
screenshot TEXT DEFAULT ""
|
||||
screenshot TEXT DEFAULT "",
|
||||
response_headers TEXT DEFAULT "{}" -- New column added
|
||||
)
|
||||
''')
|
||||
|
||||
@@ -105,26 +107,51 @@ class AsyncDatabaseManager:
|
||||
|
||||
column_names = await self.execute_with_retry(_check_columns)
|
||||
|
||||
for column in ['media', 'links', 'metadata', 'screenshot']:
|
||||
# List of new columns to add
|
||||
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers']
|
||||
|
||||
for column in new_columns:
|
||||
if column not in column_names:
|
||||
await self.aalter_db_add_column(column)
|
||||
|
||||
async def aalter_db_add_column(self, new_column: str):
    """Add a TEXT column named ``new_column`` to the ``crawled_data`` table.

    Exactly one ``ALTER TABLE`` statement is issued per call. The
    ``response_headers`` column defaults to ``"{}"``; every other column
    defaults to the empty string. The statement is run through
    ``self.execute_with_retry`` and a log line is emitted on success.

    Args:
        new_column: Name of the column to add. It is interpolated directly into
            the SQL text; the visible caller only passes fixed column names.
    """
    default_literal = '"{}"' if new_column == 'response_headers' else '""'

    async def _alter(db):
        await db.execute(
            f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT {default_literal}'
        )
        logger.info(f"Added column '{new_column}' to the database.")

    await self.execute_with_retry(_alter)
|
||||
|
||||
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
|
||||
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]:
|
||||
"""Retrieve cached URL data"""
|
||||
async def _get(db):
|
||||
async with db.execute(
|
||||
'SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?',
|
||||
'''
|
||||
SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers
|
||||
FROM crawled_data WHERE url = ?
|
||||
''',
|
||||
(url,)
|
||||
) as cursor:
|
||||
return await cursor.fetchone()
|
||||
row = await cursor.fetchone()
|
||||
if row:
|
||||
# Deserialize JSON fields
|
||||
return (
|
||||
row[0], # url
|
||||
row[1], # html
|
||||
row[2], # cleaned_html
|
||||
row[3], # markdown
|
||||
row[4], # extracted_content
|
||||
row[5], # success
|
||||
json.loads(row[6] or '{}'), # media
|
||||
json.loads(row[7] or '{}'), # links
|
||||
json.loads(row[8] or '{}'), # metadata
|
||||
row[9], # screenshot
|
||||
json.loads(row[10] or '{}') # response_headers
|
||||
)
|
||||
return None
|
||||
|
||||
try:
|
||||
return await self.execute_with_retry(_get)
|
||||
@@ -132,12 +159,27 @@ class AsyncDatabaseManager:
|
||||
logger.error(f"Error retrieving cached URL: {e}")
|
||||
return None
|
||||
|
||||
async def acache_url(self, url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""):
|
||||
async def acache_url(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
cleaned_html: str,
|
||||
markdown: str,
|
||||
extracted_content: str,
|
||||
success: bool,
|
||||
media: str = "{}",
|
||||
links: str = "{}",
|
||||
metadata: str = "{}",
|
||||
screenshot: str = "",
|
||||
response_headers: str = "{}" # New parameter added
|
||||
):
|
||||
"""Cache URL data with retry logic"""
|
||||
async def _cache(db):
|
||||
await db.execute('''
|
||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO crawled_data (
|
||||
url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
@@ -147,8 +189,9 @@ class AsyncDatabaseManager:
|
||||
media = excluded.media,
|
||||
links = excluded.links,
|
||||
metadata = excluded.metadata,
|
||||
screenshot = excluded.screenshot
|
||||
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
|
||||
screenshot = excluded.screenshot,
|
||||
response_headers = excluded.response_headers -- Update response_headers
|
||||
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers))
|
||||
|
||||
try:
|
||||
await self.execute_with_retry(_cache)
|
||||
@@ -189,4 +232,4 @@ class AsyncDatabaseManager:
|
||||
logger.error(f"Error flushing database: {e}")
|
||||
|
||||
# Create a singleton instance
|
||||
async_db_manager = AsyncDatabaseManager()
|
||||
async_db_manager = AsyncDatabaseManager()
|
||||
|
||||
@@ -9,7 +9,7 @@ from .async_database import async_db_manager
|
||||
from .chunking_strategy import *
|
||||
from .extraction_strategy import *
|
||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
||||
from .content_scrapping_strategy import WebScrappingStrategy
|
||||
from .content_scrapping_strategy import WebScrapingStrategy
|
||||
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
from .utils import (
|
||||
sanitize_input_encode,
|
||||
@@ -47,17 +47,17 @@ class AsyncWebCrawler:
|
||||
|
||||
async def awarmup(self):
|
||||
# Print a message for crawl4ai and its version
|
||||
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
||||
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
|
||||
# await async_db_manager.ainit_db()
|
||||
await async_db_manager.initialize()
|
||||
# await self.arun(
|
||||
# url="https://google.com/",
|
||||
# word_count_threshold=5,
|
||||
# bypass_cache=False,
|
||||
# verbose=False,
|
||||
# )
|
||||
await self.arun(
|
||||
url="https://google.com/",
|
||||
word_count_threshold=5,
|
||||
bypass_cache=False,
|
||||
verbose=False,
|
||||
)
|
||||
self.ready = True
|
||||
if self.verbose:
|
||||
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
|
||||
@@ -75,6 +75,19 @@ class AsyncWebCrawler:
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
"""
|
||||
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
||||
|
||||
Args:
|
||||
url (str): The URL to crawl. Supported prefixes:
|
||||
- 'http://' or 'https://': Web URL to crawl.
|
||||
- 'file://': Local file path to process.
|
||||
- 'raw:': Raw HTML content to process.
|
||||
... [other existing parameters]
|
||||
|
||||
Returns:
|
||||
CrawlResult: The result of the crawling and processing.
|
||||
"""
|
||||
try:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
@@ -89,8 +102,13 @@ class AsyncWebCrawler:
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
|
||||
is_web_url = url.startswith(('http://', 'https://'))
|
||||
if is_web_url and not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = await async_db_manager.aget_cached_url(url)
|
||||
|
||||
# if not bypass_cache and not self.always_by_pass_cache:
|
||||
# cached = await async_db_manager.aget_cached_url(url)
|
||||
|
||||
if kwargs.get("warmup", True) and not self.ready:
|
||||
return None
|
||||
@@ -117,25 +135,32 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
crawl_result = await self.aprocess_html(
|
||||
url,
|
||||
html,
|
||||
extracted_content,
|
||||
word_count_threshold,
|
||||
extraction_strategy,
|
||||
chunking_strategy,
|
||||
css_selector,
|
||||
screenshot_data,
|
||||
verbose,
|
||||
bool(cached),
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
word_count_threshold=word_count_threshold,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=chunking_strategy,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot_data,
|
||||
verbose=verbose,
|
||||
is_cached=bool(cached),
|
||||
async_response=async_response,
|
||||
bypass_cache=bypass_cache,
|
||||
**kwargs,
|
||||
)
|
||||
crawl_result.status_code = async_response.status_code if async_response else 200
|
||||
crawl_result.response_headers = async_response.response_headers if async_response else {}
|
||||
|
||||
if async_response:
|
||||
crawl_result.status_code = async_response.status_code
|
||||
crawl_result.response_headers = async_response.response_headers
|
||||
else:
|
||||
crawl_result.status_code = 200
|
||||
crawl_result.response_headers = cached[10]
|
||||
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = kwargs.get("session_id", None)
|
||||
return crawl_result
|
||||
|
||||
except Exception as e:
|
||||
if not hasattr(e, "msg"):
|
||||
e.msg = str(e)
|
||||
@@ -155,22 +180,40 @@ class AsyncWebCrawler:
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> List[CrawlResult]:
|
||||
tasks = [
|
||||
self.arun(
|
||||
url,
|
||||
word_count_threshold,
|
||||
extraction_strategy,
|
||||
chunking_strategy,
|
||||
bypass_cache,
|
||||
css_selector,
|
||||
screenshot,
|
||||
user_agent,
|
||||
verbose,
|
||||
**kwargs
|
||||
)
|
||||
for url in urls
|
||||
]
|
||||
return await asyncio.gather(*tasks)
|
||||
"""
|
||||
Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
|
||||
|
||||
Args:
|
||||
urls (List[str]): A list of URLs with supported prefixes:
|
||||
- 'http://' or 'https://': Web URL to crawl.
|
||||
- 'file://': Local file path to process.
|
||||
- 'raw:': Raw HTML content to process.
|
||||
... [other existing parameters]
|
||||
|
||||
Returns:
|
||||
List[CrawlResult]: The results of the crawling and processing.
|
||||
"""
|
||||
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
|
||||
semaphore = asyncio.Semaphore(semaphore_count)
|
||||
|
||||
async def crawl_with_semaphore(url):
|
||||
async with semaphore:
|
||||
return await self.arun(
|
||||
url,
|
||||
word_count_threshold=word_count_threshold,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=chunking_strategy,
|
||||
bypass_cache=bypass_cache,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot,
|
||||
user_agent=user_agent,
|
||||
verbose=verbose,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
tasks = [crawl_with_semaphore(url) for url in urls]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
return [result if not isinstance(result, Exception) else str(result) for result in results]
|
||||
|
||||
async def aprocess_html(
|
||||
self,
|
||||
@@ -184,13 +227,14 @@ class AsyncWebCrawler:
|
||||
screenshot: str,
|
||||
verbose: bool,
|
||||
is_cached: bool,
|
||||
async_response: Optional[AsyncCrawlResponse],
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
t = time.time()
|
||||
# Extract content from HTML
|
||||
try:
|
||||
t1 = time.time()
|
||||
scrapping_strategy = WebScrappingStrategy()
|
||||
scrapping_strategy = WebScrapingStrategy()
|
||||
# result = await scrapping_strategy.ascrap(
|
||||
result = scrapping_strategy.scrap(
|
||||
url,
|
||||
@@ -245,6 +289,12 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
|
||||
response_headers = "{}" # Default value
|
||||
if async_response:
|
||||
# Serialize response_headers dict to JSON string
|
||||
response_headers = json.dumps(async_response.response_headers, ensure_ascii=False)
|
||||
|
||||
|
||||
if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
|
||||
await async_db_manager.acache_url(
|
||||
@@ -258,6 +308,7 @@ class AsyncWebCrawler:
|
||||
json.dumps(links),
|
||||
json.dumps(metadata),
|
||||
screenshot=screenshot,
|
||||
response_headers=response_headers,
|
||||
)
|
||||
|
||||
return CrawlResult(
|
||||
|
||||
@@ -15,7 +15,7 @@ class ContentCleaningStrategy:
|
||||
self.link_density_threshold = 0.2
|
||||
self.max_dom_depth = 10 # To prevent excessive DOM traversal
|
||||
|
||||
def clean(self, clean_html: str) -> str:
|
||||
def clean(self, clean_html: str, soup = None) -> str:
|
||||
"""
|
||||
Main function that takes cleaned HTML and returns super cleaned HTML.
|
||||
|
||||
@@ -28,18 +28,20 @@ class ContentCleaningStrategy:
|
||||
try:
|
||||
if not clean_html or not isinstance(clean_html, str):
|
||||
return ''
|
||||
soup = BeautifulSoup(clean_html, 'html.parser')
|
||||
if not soup:
|
||||
# soup = BeautifulSoup(clean_html, 'html.parser')
|
||||
soup = BeautifulSoup(clean_html, 'lxml')
|
||||
main_content = self.extract_main_content(soup)
|
||||
if main_content:
|
||||
super_clean_element = self.clean_element(main_content)
|
||||
return str(super_clean_element)
|
||||
return super_clean_element.encode_contents().decode('utf-8')
|
||||
else:
|
||||
return ''
|
||||
except Exception:
|
||||
# Handle exceptions silently or log them as needed
|
||||
return ''
|
||||
|
||||
def extract_main_content(self, soup: BeautifulSoup) -> Optional[Tag]:
|
||||
def extract_main_content(self, soup) -> Optional[Tag]:
|
||||
"""
|
||||
Identifies and extracts the main content element from the HTML.
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import re # Point 1: Pre-Compile Regular Expressions
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -105,7 +106,39 @@ class CustomHTML2Text(HTML2Text):
|
||||
return
|
||||
super().handle_data(data, entity_char)
|
||||
|
||||
class ContentScrappingStrategy(ABC):
|
||||
# Pre-compile regular expressions for Open Graph and Twitter metadata
OG_REGEX = re.compile(r'^og:')
TWITTER_REGEX = re.compile(r'^twitter:')
# Leading integer followed by an optional run of non-digit characters (the unit).
DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")


def parse_dimension(dimension):
    """Split an image height/width value into its number and unit.

    Surrounding whitespace in the value and around the unit is ignored, so
    ``" 100px "`` and ``"100 px"`` both parse as ``(100, 'px')``.

    Args:
        dimension: The raw attribute value (e.g. ``"100px"``, ``"50%"``, ``"300"``).
            Non-string values are converted with ``str()`` before matching.

    Returns:
        A ``(number, unit)`` tuple where ``number`` is an int and ``unit`` defaults
        to ``'px'`` when none is given, or ``(None, None)`` when the value is
        empty or does not start with digits.
    """
    if not dimension:
        return None, None

    match = DIMENSION_REGEX.match(str(dimension).strip())
    if not match:
        return None, None

    number = int(match.group(1))
    unit = match.group(2).strip() or 'px'  # Default unit is 'px' if not specified
    return number, unit
|
||||
|
||||
# Fetch image file metadata to extract size and extension
|
||||
def fetch_image_file_size(img, base_url, timeout=10):
    """Return the ``Content-Length`` reported for an image, or None.

    Issues an HTTP HEAD request for the image's ``src`` (resolved against
    ``base_url`` so relative paths work; absolute/CDN URLs are kept as-is).

    Args:
        img: Element exposing ``.get('src')``.
        base_url: URL of the page the image was found on.
        timeout: Seconds to wait for the HEAD request before giving up.

    Returns:
        The ``Content-Length`` header value when the server answers 200,
        otherwise None (missing ``src``, non-200 status, or a request error).
    """
    src = img.get('src')
    if not src:
        # Without a src, urljoin would hand back the page URL itself.
        return None

    try:
        # If src is a relative path construct the full URL; if not it may be a CDN URL
        img_url = urljoin(base_url, src)
        response = requests.head(img_url, timeout=timeout)
    except (requests.RequestException, ValueError):
        # Best-effort lookup: an unreachable or malformed URL just means "size unknown".
        return None

    if response.status_code == 200:
        return response.headers.get('Content-Length', None)

    print(f"Failed to retrieve file size for {img_url}")
    return None
|
||||
|
||||
class ContentScrapingStrategy(ABC):
|
||||
@abstractmethod
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
pass
|
||||
@@ -114,7 +147,7 @@ class ContentScrappingStrategy(ABC):
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
pass
|
||||
|
||||
class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs)
|
||||
|
||||
@@ -126,9 +159,16 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
if not html:
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
# soup = BeautifulSoup(html, 'html.parser')
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
body = soup.body
|
||||
|
||||
try:
|
||||
meta = extract_metadata("", soup)
|
||||
except Exception as e:
|
||||
print('Error extracting metadata:', str(e))
|
||||
meta = {}
|
||||
|
||||
|
||||
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
||||
|
||||
@@ -187,31 +227,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
|
||||
#Score an image for it's usefulness
|
||||
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||
# Function to parse image height/width value and units
|
||||
def parse_dimension(dimension):
|
||||
if dimension:
|
||||
match = re.match(r"(\d+)(\D*)", dimension)
|
||||
if match:
|
||||
number = int(match.group(1))
|
||||
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
|
||||
return number, unit
|
||||
return None, None
|
||||
|
||||
# Fetch image file metadata to extract size and extension
|
||||
def fetch_image_file_size(img, base_url):
|
||||
#If src is relative path construct full URL, if not it may be CDN URL
|
||||
img_url = urljoin(base_url,img.get('src'))
|
||||
try:
|
||||
response = requests.head(img_url)
|
||||
if response.status_code == 200:
|
||||
return response.headers.get('Content-Length',None)
|
||||
else:
|
||||
print(f"Failed to retrieve file size for {img_url}")
|
||||
return None
|
||||
except InvalidSchema as e:
|
||||
return None
|
||||
finally:
|
||||
return
|
||||
|
||||
image_height = img.get('height')
|
||||
height_value, height_unit = parse_dimension(image_height)
|
||||
@@ -294,7 +310,6 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
|
||||
exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
|
||||
exclude_social_media_domains = list(set(exclude_social_media_domains))
|
||||
|
||||
|
||||
try:
|
||||
if element.name == 'a' and element.get('href'):
|
||||
@@ -439,15 +454,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
except Exception as e:
|
||||
print('Error processing element:', str(e))
|
||||
return False
|
||||
|
||||
#process images by filtering and extracting contextual text from the page
|
||||
# imgs = body.find_all('img')
|
||||
# media['images'] = [
|
||||
# result for result in
|
||||
# (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
|
||||
# if result is not None
|
||||
# ]
|
||||
|
||||
|
||||
process_element(body)
|
||||
|
||||
# Update the links dictionary with unique links
|
||||
@@ -478,8 +485,9 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
# Replace base64 data with empty string
|
||||
img['src'] = base64_pattern.sub('', src)
|
||||
|
||||
str_body = ""
|
||||
try:
|
||||
str(body)
|
||||
str_body = body.encode_contents().decode('utf-8')
|
||||
except Exception as e:
|
||||
# Reset body to the original HTML
|
||||
success = False
|
||||
@@ -504,11 +512,12 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
|
||||
# Append the error div to the body
|
||||
body.body.append(error_div)
|
||||
str_body = body.encode_contents().decode('utf-8')
|
||||
|
||||
print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
|
||||
|
||||
|
||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
|
||||
|
||||
try:
|
||||
h = CustomHTML2Text()
|
||||
@@ -518,15 +527,14 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
markdown = h.handle(sanitize_html(cleaned_html))
|
||||
markdown = markdown.replace(' ```', '```')
|
||||
|
||||
try:
|
||||
meta = extract_metadata(html, soup)
|
||||
except Exception as e:
|
||||
print('Error extracting metadata:', str(e))
|
||||
meta = {}
|
||||
|
||||
|
||||
cleaner = ContentCleaningStrategy()
|
||||
fit_html = cleaner.clean(cleaned_html)
|
||||
fit_markdown = h.handle(fit_html)
|
||||
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||
fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||
if kwargs.get('fit_markdown', False):
|
||||
cleaner = ContentCleaningStrategy()
|
||||
fit_html = cleaner.clean(cleaned_html)
|
||||
fit_markdown = h.handle(fit_html)
|
||||
|
||||
cleaned_html = sanitize_html(cleaned_html)
|
||||
return {
|
||||
|
||||
@@ -736,46 +736,54 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
'metadata': meta
|
||||
}
|
||||
|
||||
# Compiled once: attribute filters for Open Graph and Twitter Card <meta> tags.
_OG_PROPERTY_RE = re.compile(r'^og:')
_TWITTER_NAME_RE = re.compile(r'^twitter:')


def extract_metadata(html, soup=None):
    """Collect page metadata from the document's ``<head>``.

    Args:
        html: Raw HTML; parsed with the 'lxml' parser only when ``soup`` is not given.
        soup: An already-parsed document. When provided it is used as-is, even if
            ``html`` is empty.

    Returns:
        dict: Always-present keys ``title``, ``description``, ``keywords`` and
        ``author`` (None when the tag is absent), plus one entry per non-empty
        ``og:*`` property and ``twitter:*`` name. An empty dict is returned when
        there is nothing to parse or the document has no ``<head>``.
    """
    if soup is None:
        if not html:
            return {}
        soup = BeautifulSoup(html, 'lxml')

    head = soup.head
    if head is None:
        return {}

    metadata = {}

    # Title
    title_tag = head.find('title')
    has_title = title_tag is not None and title_tag.string
    metadata['title'] = title_tag.string.strip() if has_title else None

    # Meta description / keywords / author
    for meta_name in ('description', 'keywords', 'author'):
        tag = head.find('meta', attrs={'name': meta_name})
        # .get() tolerates a <meta> without a content attribute.
        metadata[meta_name] = tag.get('content', '').strip() if tag is not None else None

    # Open Graph metadata is keyed by 'property', Twitter Card metadata by 'name'.
    for key_attr, pattern in (('property', _OG_PROPERTY_RE), ('name', _TWITTER_NAME_RE)):
        for tag in head.find_all('meta', attrs={key_attr: pattern}):
            property_name = tag.get(key_attr, '').strip()
            content = tag.get('content', '').strip()
            if property_name and content:
                metadata[property_name] = content

    return metadata
|
||||
|
||||
|
||||
def extract_xml_tags(string):
    """Return the distinct names of bare ``<name>`` tags found in ``string``.

    Only tags made of word characters directly between ``<`` and ``>`` are
    matched, so closing tags and tags with attributes are ignored. The order of
    the returned list is unspecified.
    """
    unique_names = {found.group(1) for found in re.finditer(r'<(\w+)>', string)}
    return list(unique_names)
|
||||
|
||||
@@ -10,7 +10,7 @@ from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .content_scrapping_strategy import WebScrappingStrategy
|
||||
from .content_scrapping_strategy import WebScrapingStrategy
|
||||
from .config import *
|
||||
import warnings
|
||||
import json
|
||||
@@ -182,7 +182,7 @@ class WebCrawler:
|
||||
# Extract content from HTML
|
||||
try:
|
||||
t1 = time.time()
|
||||
scrapping_strategy = WebScrappingStrategy()
|
||||
scrapping_strategy = WebScrapingStrategy()
|
||||
extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
|
||||
result = scrapping_strategy.scrap(
|
||||
url,
|
||||
|
||||
Reference in New Issue
Block a user