Commit Message:

Enhance crawler capabilities and documentation - Added SSL certificate extraction in AsyncWebCrawler. - Introduced new content filters and chunking strategies for more robust data extraction. - Updated documentation management to streamline user experience.
2024-12-26 15:17:07 +08:00
parent d5ed451299
commit 9a4ed6bbd7
72 changed files with 14793 additions and 363 deletions
--- a/docs/examples/ssl_example.py
+++ b/docs/examples/ssl_example.py
@@ -0,0 +1,46 @@
+"""Example showing how to work with SSL certificates in Crawl4AI."""
+
+import asyncio
+import os
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+# Create tmp directory if it doesn't exist
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+tmp_dir = os.path.join(parent_dir, "tmp")
+os.makedirs(tmp_dir, exist_ok=True)
+
+async def main():
+    # Configure crawler to fetch SSL certificate
+    config = CrawlerRunConfig(
+        fetch_ssl_certificate=True,
+        cache_mode=CacheMode.BYPASS  # Bypass cache to always get fresh certificates
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url='https://example.com',
+            config=config
+        )
+        
+        if result.success and result.ssl_certificate:
+            cert = result.ssl_certificate
+            
+            # 1. Access certificate properties directly
+            print("\nCertificate Information:")
+            print(f"Issuer: {cert.issuer.get('CN', '')}")
+            print(f"Valid until: {cert.valid_until}")
+            print(f"Fingerprint: {cert.fingerprint}")
+            
+            # 2. Export certificate in different formats
+            cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
+            print("\nCertificate exported to:")
+            print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
+            
+            pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))  # For web servers
+            print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
+            
+            der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der"))  # For Java apps
+            print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
+if __name__ == "__main__":  # Run the demo only when executed as a script, not when imported
+    asyncio.run(main())
--- a/docs/llm.txt/14_proxy_security.md
+++ b/docs/llm.txt/14_proxy_security.md
@@ -93,3 +93,39 @@ crawler_config = CrawlerRunConfig(magic=True)  # Enable all anti-detection featu
 async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(url="https://example.com", config=crawler_config)
 ```
+
+## SSL Certificate Verification
+
+Crawl4AI can retrieve and analyze SSL certificates from HTTPS websites. This is useful for:
+- Verifying website authenticity
+- Detecting potential security issues
+- Analyzing certificate chains
+- Exporting certificates for further analysis
+
+Enable SSL certificate retrieval by setting `fetch_ssl_certificate=True` in `CrawlerRunConfig`:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+config = CrawlerRunConfig(fetch_ssl_certificate=True)
+async with AsyncWebCrawler() as crawler:  # needs an async context, e.g. inside `async def main()` run via asyncio.run()
+    result = await crawler.arun(url="https://example.com", config=config)
+    
+    if result.success and result.ssl_certificate:
+        cert = result.ssl_certificate
+        
+        # Access certificate properties
+        print(f"Issuer: {cert.issuer.get('CN', '')}")
+        print(f"Valid until: {cert.valid_until}")
+        print(f"Fingerprint: {cert.fingerprint}")
+        
+        # Export certificate in different formats
+        cert.to_json("cert.json")  # For analysis
+        cert.to_pem("cert.pem")    # For web servers
+        cert.to_der("cert.der")    # For Java applications
+```
+
+The SSL certificate object provides:
+- Direct access to certificate fields (issuer, subject, validity dates)
+- Methods to export in common formats (JSON, PEM, DER)
+- Certificate chain information and extensions