feat: Enhance crawler flexibility and LLM extraction capabilities
- Add browser type selection (Chromium, Firefox, WebKit) - Implement iframe content extraction - Improve image processing and dimension updates - Add custom headers support in AsyncPlaywrightCrawlerStrategy - Enhance delayed content retrieval with new parameter - Optimize HTML sanitization and Markdown conversion - Update examples in quickstart_async.py for new features
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -203,4 +203,5 @@ git_changes.py
|
|||||||
git_changes.md
|
git_changes.md
|
||||||
pypi_build.sh
|
pypi_build.sh
|
||||||
|
|
||||||
.tests/
|
.tests/
|
||||||
|
git_changes.py
|
||||||
@@ -50,7 +50,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||||
self.proxy = kwargs.get("proxy")
|
self.proxy = kwargs.get("proxy")
|
||||||
self.headless = kwargs.get("headless", True)
|
self.headless = kwargs.get("headless", True)
|
||||||
self.headers = {}
|
self.browser_type = kwargs.get("browser_type", "chromium") # New parameter
|
||||||
|
self.headers = kwargs.get("headers", {})
|
||||||
self.sessions = {}
|
self.sessions = {}
|
||||||
self.session_ttl = 1800
|
self.session_ttl = 1800
|
||||||
self.js_code = js_code
|
self.js_code = js_code
|
||||||
@@ -80,7 +81,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
if self.browser is None:
|
if self.browser is None:
|
||||||
browser_args = {
|
browser_args = {
|
||||||
"headless": self.headless,
|
"headless": self.headless,
|
||||||
# "headless": False,
|
|
||||||
"args": [
|
"args": [
|
||||||
"--disable-gpu",
|
"--disable-gpu",
|
||||||
"--disable-dev-shm-usage",
|
"--disable-dev-shm-usage",
|
||||||
@@ -95,7 +95,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
browser_args["proxy"] = proxy_settings
|
browser_args["proxy"] = proxy_settings
|
||||||
|
|
||||||
|
|
||||||
self.browser = await self.playwright.chromium.launch(**browser_args)
|
# Select the appropriate browser based on the browser_type
|
||||||
|
if self.browser_type == "firefox":
|
||||||
|
self.browser = await self.playwright.firefox.launch(**browser_args)
|
||||||
|
elif self.browser_type == "webkit":
|
||||||
|
self.browser = await self.playwright.webkit.launch(**browser_args)
|
||||||
|
else:
|
||||||
|
self.browser = await self.playwright.chromium.launch(**browser_args)
|
||||||
|
|
||||||
await self.execute_hook('on_browser_created', self.browser)
|
await self.execute_hook('on_browser_created', self.browser)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
@@ -145,7 +152,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
for sid in expired_sessions:
|
for sid in expired_sessions:
|
||||||
asyncio.create_task(self.kill_session(sid))
|
asyncio.create_task(self.kill_session(sid))
|
||||||
|
|
||||||
|
|
||||||
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
|
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
|
||||||
wait_for = wait_for.strip()
|
wait_for = wait_for.strip()
|
||||||
|
|
||||||
@@ -209,6 +215,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Error in wait condition: {str(e)}")
|
raise RuntimeError(f"Error in wait condition: {str(e)}")
|
||||||
|
|
||||||
|
async def process_iframes(self, page):
|
||||||
|
# Find all iframes
|
||||||
|
iframes = await page.query_selector_all('iframe')
|
||||||
|
|
||||||
|
for i, iframe in enumerate(iframes):
|
||||||
|
try:
|
||||||
|
# Add a unique identifier to the iframe
|
||||||
|
await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')
|
||||||
|
|
||||||
|
# Get the frame associated with this iframe
|
||||||
|
frame = await iframe.content_frame()
|
||||||
|
|
||||||
|
if frame:
|
||||||
|
# Wait for the frame to load
|
||||||
|
await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout
|
||||||
|
|
||||||
|
# Extract the content of the iframe's body
|
||||||
|
iframe_content = await frame.evaluate('() => document.body.innerHTML')
|
||||||
|
|
||||||
|
# Generate a unique class name for this iframe
|
||||||
|
class_name = f'extracted-iframe-content-{i}'
|
||||||
|
|
||||||
|
# Replace the iframe with a div containing the extracted content
|
||||||
|
_iframe = iframe_content.replace('`', '\\`')
|
||||||
|
await page.evaluate(f"""
|
||||||
|
() => {{
|
||||||
|
const iframe = document.getElementById('iframe-{i}');
|
||||||
|
const div = document.createElement('div');
|
||||||
|
div.innerHTML = `{_iframe}`;
|
||||||
|
div.className = '{class_name}';
|
||||||
|
iframe.replaceWith(div);
|
||||||
|
}}
|
||||||
|
""")
|
||||||
|
else:
|
||||||
|
print(f"Warning: Could not access content frame for iframe {i}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error processing iframe {i}: {str(e)}")
|
||||||
|
|
||||||
|
# Return the page object
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
status_code = None
|
status_code = None
|
||||||
@@ -263,6 +311,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
status_code = 200
|
status_code = 200
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
|
|
||||||
|
|
||||||
await page.wait_for_selector('body')
|
await page.wait_for_selector('body')
|
||||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
|
||||||
@@ -305,11 +354,78 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
if kwargs.get("screenshot"):
|
if kwargs.get("screenshot"):
|
||||||
screenshot_data = await self.take_screenshot(url)
|
screenshot_data = await self.take_screenshot(url)
|
||||||
|
|
||||||
|
|
||||||
|
# New code to update image dimensions
|
||||||
|
update_image_dimensions_js = """
|
||||||
|
() => {
|
||||||
|
return new Promise((resolve) => {
|
||||||
|
const filterImage = (img) => {
|
||||||
|
// Filter out images that are too small
|
||||||
|
if (img.width < 100 && img.height < 100) return false;
|
||||||
|
|
||||||
|
// Filter out images that are not visible
|
||||||
|
const rect = img.getBoundingClientRect();
|
||||||
|
if (rect.width === 0 || rect.height === 0) return false;
|
||||||
|
|
||||||
|
// Filter out images with certain class names (e.g., icons, thumbnails)
|
||||||
|
if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false;
|
||||||
|
|
||||||
|
// Filter out images with certain patterns in their src (e.g., placeholder images)
|
||||||
|
if (img.src.includes('placeholder') || img.src.includes('icon')) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
const images = Array.from(document.querySelectorAll('img')).filter(filterImage);
|
||||||
|
let imagesLeft = images.length;
|
||||||
|
|
||||||
|
if (imagesLeft === 0) {
|
||||||
|
resolve();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const checkImage = (img) => {
|
||||||
|
if (img.complete && img.naturalWidth !== 0) {
|
||||||
|
img.setAttribute('width', img.naturalWidth);
|
||||||
|
img.setAttribute('height', img.naturalHeight);
|
||||||
|
imagesLeft--;
|
||||||
|
if (imagesLeft === 0) resolve();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
images.forEach(img => {
|
||||||
|
checkImage(img);
|
||||||
|
if (!img.complete) {
|
||||||
|
img.onload = () => {
|
||||||
|
checkImage(img);
|
||||||
|
};
|
||||||
|
img.onerror = () => {
|
||||||
|
imagesLeft--;
|
||||||
|
if (imagesLeft === 0) resolve();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Fallback timeout of 5 seconds
|
||||||
|
setTimeout(() => resolve(), 5000);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
await page.evaluate(update_image_dimensions_js)
|
||||||
|
|
||||||
|
# Wait a bit for any onload events to complete
|
||||||
|
await page.wait_for_timeout(100)
|
||||||
|
|
||||||
|
# Process iframes
|
||||||
|
if kwargs.get("process_iframes", False):
|
||||||
|
page = await self.process_iframes(page)
|
||||||
|
|
||||||
await self.execute_hook('before_retrieve_html', page)
|
await self.execute_hook('before_retrieve_html', page)
|
||||||
# Check if delay_before_return_html is set then wait for that time
|
# Check if delay_before_return_html is set then wait for that time
|
||||||
delay_before_return_html = kwargs.get("delay_before_return_html")
|
delay_before_return_html = kwargs.get("delay_before_return_html")
|
||||||
if delay_before_return_html:
|
if delay_before_return_html:
|
||||||
await asyncio.sleep(delay_before_return_html)
|
await asyncio.sleep(delay_before_return_html)
|
||||||
|
|
||||||
html = await page.content()
|
html = await page.content()
|
||||||
await self.execute_hook('before_return_html', page, html)
|
await self.execute_hook('before_return_html', page, html)
|
||||||
|
|
||||||
@@ -398,7 +514,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Error as e:
|
except Error as e:
|
||||||
raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}")
|
raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
||||||
semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
|
semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
|
||||||
semaphore = asyncio.Semaphore(semaphore_count)
|
semaphore = asyncio.Semaphore(semaphore_count)
|
||||||
|
|||||||
@@ -16,8 +16,6 @@ from .utils import (
|
|||||||
CustomHTML2Text
|
CustomHTML2Text
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ContentScrappingStrategy(ABC):
|
class ContentScrappingStrategy(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||||
@@ -129,7 +127,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
|
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
|
||||||
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
||||||
# Remove . from format
|
# Remove . from format
|
||||||
image_format = image_format.strip('.')
|
image_format = image_format.strip('.').split('?')[0]
|
||||||
score = 0
|
score = 0
|
||||||
if height_value:
|
if height_value:
|
||||||
if height_unit == 'px' and height_value > 150:
|
if height_unit == 'px' and height_value > 150:
|
||||||
@@ -158,6 +156,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
return None
|
return None
|
||||||
return {
|
return {
|
||||||
'src': img.get('src', ''),
|
'src': img.get('src', ''),
|
||||||
|
'data-src': img.get('data-src', ''),
|
||||||
'alt': img.get('alt', ''),
|
'alt': img.get('alt', ''),
|
||||||
'desc': find_closest_parent_with_useful_text(img),
|
'desc': find_closest_parent_with_useful_text(img),
|
||||||
'score': score,
|
'score': score,
|
||||||
@@ -275,11 +274,14 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
# Replace base64 data with empty string
|
# Replace base64 data with empty string
|
||||||
img['src'] = base64_pattern.sub('', src)
|
img['src'] = base64_pattern.sub('', src)
|
||||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||||
cleaned_html = sanitize_html(cleaned_html)
|
|
||||||
|
|
||||||
h = CustomHTML2Text()
|
h = CustomHTML2Text()
|
||||||
h.ignore_links = True
|
h.ignore_links = True
|
||||||
markdown = h.handle(cleaned_html)
|
h.body_width = 0
|
||||||
|
try:
|
||||||
|
markdown = h.handle(cleaned_html)
|
||||||
|
except Exception as e:
|
||||||
|
markdown = h.handle(sanitize_html(cleaned_html))
|
||||||
markdown = markdown.replace(' ```', '```')
|
markdown = markdown.replace(' ```', '```')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -288,6 +290,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
print('Error extracting metadata:', str(e))
|
print('Error extracting metadata:', str(e))
|
||||||
meta = {}
|
meta = {}
|
||||||
|
|
||||||
|
cleaned_html = sanitize_html(cleaned_html)
|
||||||
return {
|
return {
|
||||||
'markdown': markdown,
|
'markdown': markdown,
|
||||||
'cleaned_html': cleaned_html,
|
'cleaned_html': cleaned_html,
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage:
|
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
|
||||||
<url>{URL}</url>
|
<url>{URL}</url>
|
||||||
|
|
||||||
And here is the cleaned HTML content of that webpage:
|
And here is the cleaned HTML content of that webpage:
|
||||||
@@ -79,7 +79,7 @@ To generate the JSON objects:
|
|||||||
2. For each block:
|
2. For each block:
|
||||||
a. Assign it an index based on its order in the content.
|
a. Assign it an index based on its order in the content.
|
||||||
b. Analyze the content and generate ONE semantic tag that describe what the block is about.
|
b. Analyze the content and generate ONE semantic tag that describe what the block is about.
|
||||||
c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
|
c. Extract the text content, EXACTLY SAME AS THE GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
|
||||||
|
|
||||||
3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
|
3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
|
||||||
|
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ def split_and_parse_json_objects(json_string):
|
|||||||
return parsed_objects, unparsed_segments
|
return parsed_objects, unparsed_segments
|
||||||
|
|
||||||
def sanitize_html(html):
|
def sanitize_html(html):
|
||||||
# Replace all weird and special characters with an empty string
|
# Replace all unwanted and special characters with an empty string
|
||||||
sanitized_html = html
|
sanitized_html = html
|
||||||
# sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
|
# sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
|
||||||
|
|
||||||
@@ -301,7 +301,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
|
|||||||
if tag.name != 'img':
|
if tag.name != 'img':
|
||||||
tag.attrs = {}
|
tag.attrs = {}
|
||||||
|
|
||||||
# Extract all img tgas inti [{src: '', alt: ''}]
|
# Extract all img tgas int0 [{src: '', alt: ''}]
|
||||||
media = {
|
media = {
|
||||||
'images': [],
|
'images': [],
|
||||||
'videos': [],
|
'videos': [],
|
||||||
@@ -339,7 +339,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
|
|||||||
img.decompose()
|
img.decompose()
|
||||||
|
|
||||||
|
|
||||||
# Create a function that replace content of all"pre" tage with its inner text
|
# Create a function that replace content of all"pre" tag with its inner text
|
||||||
def replace_pre_tags_with_text(node):
|
def replace_pre_tags_with_text(node):
|
||||||
for child in node.find_all('pre'):
|
for child in node.find_all('pre'):
|
||||||
# set child inner html to its text
|
# set child inner html to its text
|
||||||
@@ -502,7 +502,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
current_tag = tag
|
current_tag = tag
|
||||||
while current_tag:
|
while current_tag:
|
||||||
current_tag = current_tag.parent
|
current_tag = current_tag.parent
|
||||||
# Get the text content of the parent tag
|
# Get the text content from the parent tag
|
||||||
if current_tag:
|
if current_tag:
|
||||||
text_content = current_tag.get_text(separator=' ',strip=True)
|
text_content = current_tag.get_text(separator=' ',strip=True)
|
||||||
# Check if the text content has at least word_count_threshold
|
# Check if the text content has at least word_count_threshold
|
||||||
@@ -511,88 +511,88 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def process_image(img, url, index, total_images):
|
def process_image(img, url, index, total_images):
|
||||||
#Check if an image has valid display and inside undesired html elements
|
#Check if an image has valid display and inside undesired html elements
|
||||||
def is_valid_image(img, parent, parent_classes):
|
def is_valid_image(img, parent, parent_classes):
|
||||||
style = img.get('style', '')
|
style = img.get('style', '')
|
||||||
src = img.get('src', '')
|
src = img.get('src', '')
|
||||||
classes_to_check = ['button', 'icon', 'logo']
|
classes_to_check = ['button', 'icon', 'logo']
|
||||||
tags_to_check = ['button', 'input']
|
tags_to_check = ['button', 'input']
|
||||||
return all([
|
return all([
|
||||||
'display:none' not in style,
|
'display:none' not in style,
|
||||||
src,
|
src,
|
||||||
not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
|
not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
|
||||||
parent.name not in tags_to_check
|
parent.name not in tags_to_check
|
||||||
])
|
])
|
||||||
|
|
||||||
#Score an image for it's usefulness
|
#Score an image for it's usefulness
|
||||||
def score_image_for_usefulness(img, base_url, index, images_count):
|
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||||
# Function to parse image height/width value and units
|
# Function to parse image height/width value and units
|
||||||
def parse_dimension(dimension):
|
def parse_dimension(dimension):
|
||||||
if dimension:
|
if dimension:
|
||||||
match = re.match(r"(\d+)(\D*)", dimension)
|
match = re.match(r"(\d+)(\D*)", dimension)
|
||||||
if match:
|
if match:
|
||||||
number = int(match.group(1))
|
number = int(match.group(1))
|
||||||
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
|
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
|
||||||
return number, unit
|
return number, unit
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
# Fetch image file metadata to extract size and extension
|
# Fetch image file metadata to extract size and extension
|
||||||
def fetch_image_file_size(img, base_url):
|
def fetch_image_file_size(img, base_url):
|
||||||
#If src is relative path construct full URL, if not it may be CDN URL
|
#If src is relative path construct full URL, if not it may be CDN URL
|
||||||
img_url = urljoin(base_url,img.get('src'))
|
img_url = urljoin(base_url,img.get('src'))
|
||||||
try:
|
try:
|
||||||
response = requests.head(img_url)
|
response = requests.head(img_url)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
return response.headers.get('Content-Length',None)
|
return response.headers.get('Content-Length',None)
|
||||||
else:
|
else:
|
||||||
print(f"Failed to retrieve file size for {img_url}")
|
print(f"Failed to retrieve file size for {img_url}")
|
||||||
return None
|
|
||||||
except InvalidSchema as e:
|
|
||||||
return None
|
return None
|
||||||
finally:
|
except InvalidSchema as e:
|
||||||
return
|
return None
|
||||||
|
finally:
|
||||||
|
return
|
||||||
|
|
||||||
image_height = img.get('height')
|
image_height = img.get('height')
|
||||||
height_value, height_unit = parse_dimension(image_height)
|
height_value, height_unit = parse_dimension(image_height)
|
||||||
image_width = img.get('width')
|
image_width = img.get('width')
|
||||||
width_value, width_unit = parse_dimension(image_width)
|
width_value, width_unit = parse_dimension(image_width)
|
||||||
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
|
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
|
||||||
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
||||||
# Remove . from format
|
# Remove . from format
|
||||||
image_format = image_format.strip('.')
|
image_format = image_format.strip('.')
|
||||||
score = 0
|
score = 0
|
||||||
if height_value:
|
if height_value:
|
||||||
if height_unit == 'px' and height_value > 150:
|
if height_unit == 'px' and height_value > 150:
|
||||||
score += 1
|
|
||||||
if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
|
|
||||||
score += 1
|
|
||||||
if width_value:
|
|
||||||
if width_unit == 'px' and width_value > 150:
|
|
||||||
score += 1
|
|
||||||
if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
|
|
||||||
score += 1
|
|
||||||
if image_size > 10000:
|
|
||||||
score += 1
|
score += 1
|
||||||
if img.get('alt') != '':
|
if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
|
||||||
score+=1
|
score += 1
|
||||||
if any(image_format==format for format in ['jpg','png','webp']):
|
if width_value:
|
||||||
score+=1
|
if width_unit == 'px' and width_value > 150:
|
||||||
if index/images_count<0.5:
|
score += 1
|
||||||
score+=1
|
if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
|
||||||
return score
|
score += 1
|
||||||
|
if image_size > 10000:
|
||||||
|
score += 1
|
||||||
|
if img.get('alt') != '':
|
||||||
|
score+=1
|
||||||
|
if any(image_format==format for format in ['jpg','png','webp']):
|
||||||
|
score+=1
|
||||||
|
if index/images_count<0.5:
|
||||||
|
score+=1
|
||||||
|
return score
|
||||||
|
|
||||||
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
||||||
return None
|
return None
|
||||||
score = score_image_for_usefulness(img, url, index, total_images)
|
score = score_image_for_usefulness(img, url, index, total_images)
|
||||||
if score <= IMAGE_SCORE_THRESHOLD:
|
if score <= IMAGE_SCORE_THRESHOLD:
|
||||||
return None
|
return None
|
||||||
return {
|
return {
|
||||||
'src': img.get('src', ''),
|
'src': img.get('src', '').replace('\\"', '"').strip(),
|
||||||
'alt': img.get('alt', ''),
|
'alt': img.get('alt', ''),
|
||||||
'desc': find_closest_parent_with_useful_text(img),
|
'desc': find_closest_parent_with_useful_text(img),
|
||||||
'score': score,
|
'score': score,
|
||||||
'type': 'image'
|
'type': 'image'
|
||||||
}
|
}
|
||||||
|
|
||||||
def process_element(element: element.PageElement) -> bool:
|
def process_element(element: element.PageElement) -> bool:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from typing import List
|
|||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from .config import *
|
from .config import *
|
||||||
import warnings
|
import warnings
|
||||||
|
import json
|
||||||
warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace "model_".')
|
warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace "model_".')
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -357,6 +357,28 @@ async def crawl_dynamic_content_pages_method_3():
|
|||||||
await crawler.crawler_strategy.kill_session(session_id)
|
await crawler.crawler_strategy.kill_session(session_id)
|
||||||
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||||
|
|
||||||
|
async def crawl_custom_browser_type():
|
||||||
|
# Use Firefox
|
||||||
|
start = time.time()
|
||||||
|
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
|
||||||
|
result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
|
||||||
|
print(result.markdown[:500])
|
||||||
|
print("Time taken: ", time.time() - start)
|
||||||
|
|
||||||
|
# Use WebKit
|
||||||
|
start = time.time()
|
||||||
|
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
|
||||||
|
result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
|
||||||
|
print(result.markdown[:500])
|
||||||
|
print("Time taken: ", time.time() - start)
|
||||||
|
|
||||||
|
# Use Chromium (default)
|
||||||
|
start = time.time()
|
||||||
|
async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
|
||||||
|
result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
|
||||||
|
print(result.markdown[:500])
|
||||||
|
print("Time taken: ", time.time() - start)
|
||||||
|
|
||||||
async def speed_comparison():
|
async def speed_comparison():
|
||||||
# print("\n--- Speed Comparison ---")
|
# print("\n--- Speed Comparison ---")
|
||||||
# print("Firecrawl (simulated):")
|
# print("Firecrawl (simulated):")
|
||||||
@@ -446,6 +468,9 @@ async def main():
|
|||||||
# await crawl_dynamic_content_pages_method_1()
|
# await crawl_dynamic_content_pages_method_1()
|
||||||
# await crawl_dynamic_content_pages_method_2()
|
# await crawl_dynamic_content_pages_method_2()
|
||||||
await crawl_dynamic_content_pages_method_3()
|
await crawl_dynamic_content_pages_method_3()
|
||||||
|
|
||||||
|
await crawl_custom_browser_type()
|
||||||
|
|
||||||
await speed_comparison()
|
await speed_comparison()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user