refactor(crawler): improve HTML handling and clean up codebase

- Add HTML attribute preservation in GoogleSearchCrawler
- Fix lxml import references in utils.py
- Remove unused ssl_certificate.json
- Clean up imports and code organization in hub.py
- Update test case formatting and remove unused image search test

BREAKING CHANGE: Removed the ssl_certificate.json file, which might affect existing certificate validations.
This commit is contained in:
UncleCode
2025-02-07 21:56:27 +08:00
parent 91073c1244
commit b957ff2ecd
5 changed files with 17 additions and 79 deletions

View File

@@ -17,12 +17,16 @@ async def google_example():
crawler = crawler_cls()
# Text search
text_results = await crawler.run(query="apple inc", search_type="text", schema_cache_path="/Users/unclecode/.crawl4ai")
print(json.loads(text_results))
text_results = await crawler.run(
query="apple inc",
search_type="text",
schema_cache_path="/Users/unclecode/.crawl4ai"
)
print(json.dumps(json.loads(text_results), indent=4))
# Image search
image_results = await crawler.run(query="apple inc", search_type="image")
print(image_results)
# image_results = await crawler.run(query="apple inc", search_type="image")
# print(image_results)
if __name__ == "__main__":
import asyncio