Enhance crawler strategies with new features
- Reimplemented `JsonXPathExtractionStrategy` for enhanced JSON data extraction.
- Updated existing extraction strategies for better performance.
- Improved handling of response status codes during crawls.
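For context, the reimplemented `JsonXPathExtractionStrategy` is schema-driven in the same way as `JsonCssExtractionStrategy`: a base XPath picks out each repeating element and per-field selectors pull values from it. The sketch below is illustrative only; it assumes the class is importable from `crawl4ai.extraction_strategy` and that the schema keys (`name`, `baseSelector`, `fields`) mirror the CSS strategy, which the diff further down suggests but does not show in full.

```python
# Hypothetical usage sketch; the import path and schema key names are assumptions.
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy

schema = {
    "name": "Products",
    "baseSelector": "//div[@class='product']",  # XPath selecting each repeating block
    "fields": [
        {"name": "title", "selector": ".//h2", "type": "text"},
        {"name": "url", "selector": ".//a", "type": "attribute", "attribute": "href"},
    ],
}

html_snippet = """
<div class="product"><h2>Widget</h2><a href="/widget">Buy</a></div>
<div class="product"><h2>Gadget</h2><a href="/gadget">Buy</a></div>
"""

strategy = JsonXPathExtractionStrategy(schema)
# run() joins the sections and delegates to extract(), as the diff below shows.
results = strategy.run("https://example.com", [html_snippet])
print(results)  # expected shape: [{'title': 'Widget', 'url': '/widget'}, {'title': 'Gadget', 'url': '/gadget'}]
```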
@@ -709,7 +709,7 @@ This commit introduces several key enhancements, including improved error handli
 - Improved `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500), significantly reducing wait time when closing the crawler.
 - Enhanced flexibility in `CosineStrategy`:
   - Now uses a more generic `load_HF_embedding_model` function, allowing for easier swapping of embedding models.
-- Updated `JsonCssExtractionStrategy` and `JsonXPATHExtractionStrategy` for better JSON-based extraction.
+- Updated `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy` for better JSON-based extraction.
 
 ### Fixed
 - Addressed potential issues with the sliding window chunking strategy to ensure all text is properly chunked.
@@ -795,9 +795,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
 
             await self.execute_hook('after_goto', page, context=context)
 
-            status_code = response.status
-            response_headers = response.headers
+            if response is None:
+                status_code = 200
+                response_headers = {}
+            else:
+                status_code = response.status
+                response_headers = response.headers
+
         else:
             status_code = 200
             response_headers = {}
@@ -274,6 +274,7 @@ class AsyncWebCrawler:
         if cached_result:
             html = sanitize_input_encode(cached_result.html)
             extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
+            extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
             # If screenshot is requested but its not in cache, then set cache_result to None
             screenshot_data = cached_result.screenshot
             pdf_data = cached_result.pdf
@@ -476,7 +477,7 @@ class AsyncWebCrawler:
         t1 = time.perf_counter()
 
         # Handle different extraction strategy types
-        if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonCssExtractionStrategy)):
+        if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonXPathExtractionStrategy)):
             config.extraction_strategy.verbose = verbose
             extracted_content = config.extraction_strategy.run(url, [html])
             extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
@@ -10,7 +10,9 @@ from functools import partial
 from .model_loader import *
 import math
 import numpy as np
-from lxml import etree
+import re
+from bs4 import BeautifulSoup
+from lxml import html, etree
 
 class ExtractionStrategy(ABC):
     """
@@ -741,28 +743,15 @@ class JsonCssExtractionStrategy(ExtractionStrategy):
     def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
         combined_html = self.DEL.join(sections)
         return self.extract(url, combined_html, **kwargs)
 
-class JsonXPATHExtractionStrategy(ExtractionStrategy):
+class JsonXPathExtractionStrategy(ExtractionStrategy):
     def __init__(self, schema: Dict[str, Any], **kwargs):
         super().__init__(**kwargs)
         self.schema = schema
-        self.use_cssselect = self._check_cssselect()
-
-    def _check_cssselect(self):
-        try:
-            import cssselect
-            return True
-        except ImportError:
-            print("Warning: cssselect is not installed. Falling back to XPath for all selectors.")
-            return False
-
-    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
-        self.soup = BeautifulSoup(html, 'lxml')
-        self.tree = etree.HTML(str(self.soup))
-
-        selector_type = 'xpath' if not self.use_cssselect else self.schema.get('selectorType', 'css')
-        base_selector = self.schema.get('baseXPath' if selector_type == 'xpath' else 'baseSelector')
-        base_elements = self._select_elements(base_selector, selector_type)
-
+
+    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
+        tree = html.fromstring(html_content)
+        base_xpath = self.schema['baseSelector']
+        base_elements = tree.xpath(base_xpath)
         results = []
         for element in base_elements:
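The new `extract()` above leans directly on lxml: the HTML is parsed once with `html.fromstring` and the schema's `baseSelector` is evaluated as an XPath over the whole tree. A minimal standalone illustration of that step (plain lxml behaviour, nothing crawl4ai-specific):

```python
# Standalone illustration of the base-element selection used by the new extract().
from lxml import html

doc = html.fromstring("<ul><li class='row'>a</li><li class='row'>b</li></ul>")
base_elements = doc.xpath("//li[@class='row']")  # same role as schema['baseSelector']
print([el.text for el in base_elements])         # ['a', 'b']
```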
@@ -772,27 +761,40 @@ class JsonXPATHExtractionStrategy(ExtractionStrategy):
 
         return results
 
-    def _select_elements(self, selector, selector_type, element=None):
-        if selector_type == 'xpath' or not self.use_cssselect:
-            return self.tree.xpath(selector) if element is None else element.xpath(selector)
-        else:  # CSS
-            return self.tree.cssselect(selector) if element is None else element.cssselect(selector)
+    def _css_to_xpath(self, css_selector: str) -> str:
+        """Convert CSS selector to XPath if needed"""
+        if '/' in css_selector:  # Already an XPath
+            return css_selector
+        else:
+            # Fallback to basic conversion for common cases
+            return self._basic_css_to_xpath(css_selector)
+
+    def _basic_css_to_xpath(self, css_selector: str) -> str:
+        """Basic CSS to XPath conversion for common cases"""
+        # Handle basic cases
+        if ' > ' in css_selector:
+            parts = css_selector.split(' > ')
+            return '//' + '/'.join(parts)
+        if ' ' in css_selector:
+            parts = css_selector.split(' ')
+            return '//' + '//'.join(parts)
+        return '//' + css_selector
 
     def _extract_field(self, element, field):
         try:
-            selector_type = 'xpath' if not self.use_cssselect else field.get('selectorType', 'css')
-            selector = field.get('xpathSelector' if selector_type == 'xpath' else 'selector')
-
             if field['type'] == 'nested':
-                nested_element = self._select_elements(selector, selector_type, element)
-                return self._extract_item(nested_element[0], field['fields']) if nested_element else {}
+                xpath = self._css_to_xpath(field['selector'])
+                nested_element = element.xpath(xpath)[0] if element.xpath(xpath) else None
+                return self._extract_item(nested_element, field['fields']) if nested_element is not None else {}
 
             if field['type'] == 'list':
-                elements = self._select_elements(selector, selector_type, element)
+                xpath = self._css_to_xpath(field['selector'])
+                elements = element.xpath(xpath)
                 return [self._extract_list_item(el, field['fields']) for el in elements]
 
             if field['type'] == 'nested_list':
-                elements = self._select_elements(selector, selector_type, element)
+                xpath = self._css_to_xpath(field['selector'])
+                elements = element.xpath(xpath)
                 return [self._extract_item(el, field['fields']) for el in elements]
 
             return self._extract_single_field(element, field)
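The `_basic_css_to_xpath` fallback above only handles the child (`>`) and descendant (space) combinators. A small standalone sketch that reproduces the conversion logic from the diff, purely to show what it outputs:

```python
# Re-implementation of the _basic_css_to_xpath logic shown above, for illustration only.
def basic_css_to_xpath(css_selector: str) -> str:
    if ' > ' in css_selector:                              # child combinator -> '/'
        return '//' + '/'.join(css_selector.split(' > '))
    if ' ' in css_selector:                                # descendant combinator -> '//'
        return '//' + '//'.join(css_selector.split(' '))
    return '//' + css_selector                             # bare tag name

assert basic_css_to_xpath('div > p') == '//div/p'
assert basic_css_to_xpath('div p') == '//div//p'
assert basic_css_to_xpath('h2') == '//h2'
# Note: a selector such as 'div.item' falls through unchanged to '//div.item',
# which matches elements literally named 'div.item' rather than a class.
```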
@@ -810,10 +812,9 @@ class JsonXPATHExtractionStrategy(ExtractionStrategy):
         return item
 
     def _extract_single_field(self, element, field):
-        selector_type = field.get('selectorType', 'css')
-
         if 'selector' in field:
-            selected = self._select_elements(field['selector'], selector_type, element)
+            xpath = self._css_to_xpath(field['selector'])
+            selected = element.xpath(xpath)
             if not selected:
                 return field.get('default')
             selected = selected[0]
@@ -822,13 +823,13 @@ class JsonXPATHExtractionStrategy(ExtractionStrategy):
 
         value = None
         if field['type'] == 'text':
-            value = selected.text_content().strip() if hasattr(selected, 'text_content') else selected.text.strip()
+            value = ''.join(selected.xpath('.//text()')).strip()
         elif field['type'] == 'attribute':
             value = selected.get(field['attribute'])
         elif field['type'] == 'html':
             value = etree.tostring(selected, encoding='unicode')
         elif field['type'] == 'regex':
-            text = selected.text_content().strip() if hasattr(selected, 'text_content') else selected.text.strip()
+            text = ''.join(selected.xpath('.//text()')).strip()
             match = re.search(field['pattern'], text)
             value = match.group(1) if match else None
@@ -870,4 +871,4 @@ class JsonXPATHExtractionStrategy(ExtractionStrategy):
 
     def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
         combined_html = self.DEL.join(sections)
         return self.extract(url, combined_html, **kwargs)