Enhance crawler capabilities and documentation

- Add llm.txt generator
  - Added SSL certificate extraction in AsyncWebCrawler.
  - Introduced new content filters and chunking strategies for more robust data extraction.
  - Updated documentation.
This commit is contained in:
UncleCode
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions

View File

@@ -0,0 +1,49 @@
"""Example script showing how to save SSL certificates."""
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.utilities.cert_exporter import CertificateExporter
# Get location of parent folder, then "tmp" folder if not make it
import os
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
os.makedirs(os.path.join(parent_dir, "tmp"), exist_ok=True)
__tmp_dir__ = os.path.join(parent_dir, "tmp")
async def main():
# Configure crawler to fetch SSL certificate
crawl_config = CrawlerRunConfig(
fetch_ssl_certificate=True,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url='https://example.com',
config=crawl_config
)
if result.success and result.ssl_certificate:
# 1. Save as JSON (most readable format)
CertificateExporter.to_json(
result.ssl_certificate,
filepath=os.path.join(__tmp_dir__, "certificate.json")
)
print("Certificate saved in JSON format: certificate.json")
# 2. Save as PEM (standard format for web servers)
pem_data = CertificateExporter.to_pem(
result.ssl_certificate,
filepath=os.path.join(__tmp_dir__, "certificate.pem")
)
print("Certificate saved in PEM format: certificate.pem")
# Print basic certificate info
cert = result.ssl_certificate
print("\nCertificate Information:")
print(f"Issuer: {cert['issuer'].get(b'CN', '').decode()}")
print(f"Valid until: {cert['not_after']}")
print(f"Fingerprint: {cert['fingerprint']}")
if __name__ == "__main__":
asyncio.run(main())