Add Async Version, JsonCss Extrator

This commit is contained in:
unclecode
2024-09-03 01:27:00 +08:00
parent 3116f95c1a
commit c37614cbc8
17 changed files with 1922 additions and 2 deletions

View File

@@ -623,3 +623,158 @@ class ContentSummarizationStrategy(ExtractionStrategy):
# Sort summaries by the original section index to maintain order
summaries.sort(key=lambda x: x[0])
return [summary for _, summary in summaries]
class JsonCssExtractionStrategy(ExtractionStrategy):
def __init__(self, schema: Dict[str, Any], **kwargs):
super().__init__(**kwargs)
self.schema = schema
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
soup = BeautifulSoup(html, 'html.parser')
base_elements = soup.select(self.schema['baseSelector'])
results = []
for element in base_elements:
item = {}
for field in self.schema['fields']:
value = self._extract_field(element, field)
if value is not None:
item[field['name']] = value
if item:
results.append(item)
return results
def _extract_field(self, element, field):
try:
selected = element.select_one(field['selector'])
if not selected:
return None
if field['type'] == 'text':
return selected.get_text(strip=True)
elif field['type'] == 'attribute':
return selected.get(field['attribute'])
elif field['type'] == 'html':
return str(selected)
elif field['type'] == 'regex':
text = selected.get_text(strip=True)
match = re.search(field['pattern'], text)
return match.group(1) if match else None
except Exception as e:
if self.verbose:
print(f"Error extracting field {field['name']}: {str(e)}")
return None
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
combined_html = self.DEL.join(sections)
return self.extract(url, combined_html, **kwargs)
class EnhancedJsonCssExtractionStrategy(ExtractionStrategy):
def __init__(self, schema: Dict[str, Any], **kwargs):
super().__init__(**kwargs)
self.schema = schema
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
soup = BeautifulSoup(html, 'html.parser')
base_elements = soup.select(self.schema['baseSelector'])
results = []
for element in base_elements:
item = self._extract_item(element, self.schema['fields'])
if item:
results.append(item)
return results
def _extract_field(self, element, field):
try:
if field['type'] == 'nested':
nested_element = element.select_one(field['selector'])
return self._extract_item(nested_element, field['fields']) if nested_element else {}
if field['type'] == 'list':
elements = element.select(field['selector'])
return [self._extract_list_item(el, field['fields']) for el in elements]
if field['type'] == 'nested_list':
elements = element.select(field['selector'])
return [self._extract_item(el, field['fields']) for el in elements]
return self._extract_single_field(element, field)
except Exception as e:
if self.verbose:
print(f"Error extracting field {field['name']}: {str(e)}")
return field.get('default')
def _extract_list_item(self, element, fields):
item = {}
for field in fields:
value = self._extract_single_field(element, field)
if value is not None:
item[field['name']] = value
return item
def _extract_single_field(self, element, field):
if 'selector' in field:
selected = element.select_one(field['selector'])
if not selected:
return field.get('default')
else:
selected = element
value = None
if field['type'] == 'text':
value = selected.get_text(strip=True)
elif field['type'] == 'attribute':
value = selected.get(field['attribute'])
elif field['type'] == 'html':
value = str(selected)
elif field['type'] == 'regex':
text = selected.get_text(strip=True)
match = re.search(field['pattern'], text)
value = match.group(1) if match else None
if 'transform' in field:
value = self._apply_transform(value, field['transform'])
return value if value is not None else field.get('default')
def _extract_item(self, element, fields):
item = {}
for field in fields:
if field['type'] == 'computed':
value = self._compute_field(item, field)
else:
value = self._extract_field(element, field)
if value is not None:
item[field['name']] = value
return item
def _apply_transform(self, value, transform):
if transform == 'lowercase':
return value.lower()
elif transform == 'uppercase':
return value.upper()
elif transform == 'strip':
return value.strip()
return value
def _compute_field(self, item, field):
try:
if 'expression' in field:
return eval(field['expression'], {}, item)
elif 'function' in field:
return field['function'](item)
except Exception as e:
if self.verbose:
print(f"Error computing field {field['name']}: {str(e)}")
return field.get('default')
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
combined_html = self.DEL.join(sections)
return self.extract(url, combined_html, **kwargs)