fix(extraction): JsonCss selector and crawler improvements

- Fix JsonCssExtractionStrategy._get_elements to return all matching elements instead of just one
- Add robust error handling to page_need_scroll with default fallback
- Improve JSON extraction strategies documentation
- Refactor content scraping strategy
- Update version to 0.4.247
This commit is contained in:
UncleCode
2025-01-05 19:26:46 +08:00
parent 0857c7b448
commit 72fbdac467
6 changed files with 56 additions and 102 deletions

View File

@@ -974,8 +974,7 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
return parsed_html.select(selector)
def _get_elements(self, element, selector: str):
- selected = element.select_one(selector)
- return [selected] if selected else []
+ return element.select(selector)
def _get_element_text(self, element) -> str:
return element.get_text(strip=True)
@@ -1050,3 +1049,4 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
def _get_element_attribute(self, element, attribute: str):
return element.get(attribute)