Enhance crawler capabilities and documentation

- Added llm.txt generator.
- Added SSL certificate extraction in AsyncWebCrawler.
- Introduced new content filters and chunking strategies for more robust data extraction.
- Updated documentation.
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions
--- a/examples/save_certificate.py
+++ b/examples/save_certificate.py
@@ -0,0 +1,49 @@
+"""Example script showing how to save SSL certificates."""
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.utilities.cert_exporter import CertificateExporter
+
+# Resolve the "tmp" folder next to this script's parent directory and create it if it is missing
+import os
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+__tmp_dir__ = os.path.join(parent_dir, "tmp")
+os.makedirs(__tmp_dir__, exist_ok=True)
+
+async def main():
+    """Crawl https://example.com and save its SSL certificate as JSON and PEM files."""
+    # Bypass the cache and ask the crawler to attach the SSL certificate to the result
+    crawl_config = CrawlerRunConfig(
+        fetch_ssl_certificate=True,
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url='https://example.com',
+            config=crawl_config
+        )
+
+        if result.success and result.ssl_certificate:
+            cert = result.ssl_certificate
+
+            # 1. Save as JSON (most readable format)
+            json_path = os.path.join(__tmp_dir__, "certificate.json")
+            CertificateExporter.to_json(cert, filepath=json_path)
+            print(f"Certificate saved in JSON format: {json_path}")
+
+            # 2. Save as PEM (standard format for web servers)
+            pem_path = os.path.join(__tmp_dir__, "certificate.pem")
+            CertificateExporter.to_pem(cert, filepath=pem_path)
+            print(f"Certificate saved in PEM format: {pem_path}")
+
+            # Print basic certificate info
+            print("\nCertificate Information:")
+            # The issuer entry is looked up by a bytes key and then decoded, so the
+            # fallback must be bytes too; a str default would fail on .decode().
+            print(f"Issuer: {cert['issuer'].get(b'CN', b'').decode()}")
+            print(f"Valid until: {cert['not_after']}")
+            print(f"Fingerprint: {cert['fingerprint']}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/examples/ssl_certificate_example.py
+++ b/examples/ssl_certificate_example.py
@@ -0,0 +1,67 @@
+"""Example script demonstrating SSL certificate retrieval and export."""
+
+import asyncio
+import os
+from pathlib import Path
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.utilities.cert_exporter import CertificateExporter
+
+async def main():
+    """Crawl https://example.com and export its SSL certificate in several formats."""
+    # Configure crawler to fetch SSL certificate
+    crawl_config = CrawlerRunConfig(
+        fetch_ssl_certificate=True,
+        cache_mode=CacheMode.BYPASS
+    )
+
+    # Create output directory for certificates (relative to the current working directory)
+    output_dir = Path("certificates")
+    output_dir.mkdir(exist_ok=True)
+
+    async with AsyncWebCrawler() as crawler:
+        # Crawl a website
+        result = await crawler.arun(
+            url='https://example.com',
+            config=crawl_config
+        )
+        if result.success and result.ssl_certificate:
+            # 1. Export as JSON (human-readable format)
+            CertificateExporter.to_json(
+                result.ssl_certificate,
+                filepath=str(output_dir / "cert.json")
+            )
+
+            # 2. Export as PEM (standard text format, used by Apache/Nginx)
+            CertificateExporter.to_pem(
+                result.ssl_certificate,
+                filepath=str(output_dir / "cert.pem")
+            )
+
+            # 3. Export as DER (binary format, used by Java)
+            CertificateExporter.to_der(
+                result.ssl_certificate,
+                filepath=str(output_dir / "cert.der")
+            )
+
+            # 4. Export all formats at once
+            export_paths = CertificateExporter.export_all(
+                result.ssl_certificate,
+                str(output_dir),
+                "certificate"
+            )
+
+            print("Certificate exported in multiple formats:")
+            for fmt, path in export_paths.items():
+                print(f"- {fmt.upper()}: {path}")
+
+            # Print some certificate information
+            cert = result.ssl_certificate
+            print("\nCertificate Information:")
+            print(f"Subject: {cert['subject']}")
+            print(f"Issuer: {cert['issuer']}")
+            print(f"Valid from: {cert['not_before']}")
+            print(f"Valid until: {cert['not_after']}")
+            print(f"Fingerprint: {cert['fingerprint']}")
+
+if __name__ == "__main__":
+    asyncio.run(main())