Update 0.4.24 walkthrough

2024-12-31 21:01:46 +08:00
parent bd71f7f4ea
commit 19b0a5ae82
3 changed files with 152 additions and 36 deletions
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -83,7 +83,6 @@ class RelevantContentFilter(ABC):
                                
        return ' '.join(filter(None, query_parts))

-
    def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[str, str]]:
        """
        Extracts text chunks from a BeautifulSoup body element while preserving order.
--- a/docs/examples/v0_4_24_walkthrough.py
+++ b/docs/examples/v0_4_24_walkthrough.py
@@ -9,6 +9,7 @@ Each section includes detailed examples and explanations of the new capabilities
 import asyncio
 import os
 import json
+import re
 from typing import List, Optional, Dict, Any
 from pydantic import BaseModel, Field
 from crawl4ai import (
@@ -18,7 +19,9 @@ from crawl4ai import (
    CacheMode,
    LLMExtractionStrategy
 )
-from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.content_filter_strategy import RelevantContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator 
+from bs4 import BeautifulSoup

 # Sample HTML for demonstrations
 SAMPLE_HTML = """
@@ -68,10 +71,7 @@ async def demo_ssl_features():
    print("\n1. Enhanced SSL & Security Demo")
    print("--------------------------------")

-    browser_config = BrowserConfig(
-        ignore_https_errors=True,
-        verbose=True
-    )
+    browser_config = BrowserConfig()

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
@@ -84,38 +84,91 @@ async def demo_ssl_features():
            config=run_config
        )
        print(f"SSL Crawl Success: {result.success}")
+        result.ssl_certificate.to_json(
+            os.path.join(os.getcwd(), "ssl_certificate.json")
+        )
        if not result.success:
            print(f"SSL Error: {result.error_message}")

 async def demo_content_filtering():
    """
    Smart Content Filtering Demo
-    --------------------------
+    ----------------------
    
-    Demonstrates the new content filtering system with:
-    1. Regular expression pattern matching
-    2. Length-based filtering
-    3. Custom filtering rules
-    4. Content chunking strategies
-    
-    This is particularly useful for:
-    - Removing advertisements and boilerplate content
-    - Extracting meaningful paragraphs
-    - Filtering out irrelevant sections
-    - Processing content in manageable chunks
+    Demonstrates advanced content filtering capabilities:
+    1. Custom filter to identify and extract specific content
+    2. Integration with markdown generation
+    3. Flexible pruning rules
    """
    print("\n2. Smart Content Filtering Demo")
    print("--------------------------------")

-    content_filter = PruningContentFilter(
-        min_word_threshold=50,
-        threshold_type='dynamic',
-        threshold=0.5
+    # Create a custom content filter
+    class CustomNewsFilter(RelevantContentFilter):
+        """
+        News-oriented content filter used by this demo.
+
+        Header chunks are always kept. Any other chunk is kept only when it
+        has enough words and its text is not dominated by link text.
+        Elements for which ``is_excluded`` returns true are always dropped.
+        """
+
+        # A chunk whose link text makes up at least this share of its total
+        # text is treated as a link list (menus, "related" boxes) and dropped.
+        MAX_LINK_DENSITY = 0.5
+
+        def __init__(self):
+            super().__init__()
+            # Add news-specific patterns
+            # NOTE(review): presumably consulted by the inherited
+            # is_excluded() -- confirm against RelevantContentFilter.
+            self.negative_patterns = re.compile(
+                r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
+                re.I
+            )
+            self.min_word_count = 30  # Higher threshold for news content
+
+        def filter_content(self, html: str, min_word_threshold: Optional[int] = None) -> List[str]:
+            """
+            Implements news-specific content filtering logic.
+
+            Args:
+                html (str): HTML content to be filtered
+                min_word_threshold (int, optional): Minimum word count
+                    threshold. ``None`` falls back to ``self.min_word_count``;
+                    an explicit ``0`` is honoured and disables the word-count
+                    check.
+
+            Returns:
+                List[str]: List of filtered HTML content blocks
+            """
+            if not html or not isinstance(html, str):
+                return []
+
+            soup = BeautifulSoup(html, 'lxml')
+            if not soup.body:
+                # Fragment without a <body>: wrap it so the lookup below succeeds
+                soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
+
+            body = soup.find('body')
+
+            # Resolve the threshold once. Compare against None rather than
+            # relying on truthiness so that a caller-supplied 0 is not
+            # silently replaced by the default.
+            if min_word_threshold is None:
+                threshold = self.min_word_count
+            else:
+                threshold = min_word_threshold
+
+            # Extract chunks with metadata; each chunk is unpacked below as
+            # (_, text, tag_type, element)
+            chunks = self.extract_text_chunks(body, threshold)
+
+            # Filter chunks based on news-specific criteria
+            filtered_chunks = []
+            for _, _, tag_type, element in chunks:
+                # Skip if element has negative class/id
+                if self.is_excluded(element):
+                    continue
+
+                # Headers are important in news articles
+                if tag_type == 'header':
+                    filtered_chunks.append(self.clean_element(element))
+                    continue
+
+                # For content, check word count and link density.
+                # A ' ' separator is required: with strip=True and the default
+                # empty separator, text on either side of an inline tag is
+                # fused ("Hello <b>world</b>" -> "Helloworld"), which
+                # under-counts words.
+                text = element.get_text(' ', strip=True)
+                if len(text.split()) < threshold:
+                    continue
+
+                # Calculate link density using the same separator as above so
+                # numerator and denominator are measured consistently
+                links_text = ' '.join(
+                    a.get_text(' ', strip=True) for a in element.find_all('a')
+                )
+                # Empty text counts as fully link-dense so it is rejected
+                link_density = len(links_text) / len(text) if text else 1
+
+                # Accept if link density is reasonable
+                if link_density < self.MAX_LINK_DENSITY:
+                    filtered_chunks.append(self.clean_element(element))
+
+            return filtered_chunks
+
+    # Create markdown generator with custom filter
+    markdown_gen = DefaultMarkdownGenerator(
+        content_filter=CustomNewsFilter()
    )

    run_config = CrawlerRunConfig(
-        content_filter=content_filter,
-        cache_mode=CacheMode.BYPASS
+        markdown_generator=markdown_gen
    )

    async with AsyncWebCrawler() as crawler:
@@ -124,25 +177,22 @@ async def demo_content_filtering():
            config=run_config
        )
        print("Filtered Content Sample:")
-        print(result.markdown[:500] + "...\n")
+        print(result.markdown[:500])  # Show first 500 chars

 async def demo_json_extraction():
    """
-    Advanced JSON Extraction Demo
+    Improved JSON Extraction Demo
    ---------------------------
    
    Demonstrates the enhanced JSON extraction capabilities:
-    1. Using different input formats (markdown, html)
-    2. Base element attributes extraction
-    3. Complex nested structures
-    4. Multiple extraction patterns
+    1. Base element attributes extraction
+    2. Complex nested structures
+    3. Multiple extraction patterns
    
    Key features shown:
-    - Extracting from different input formats (markdown vs html)
    - Extracting attributes from base elements (href, data-* attributes)
    - Processing repeated patterns
    - Handling optional fields
-    - Computing derived values
    """
    print("\n3. Improved JSON Extraction Demo")
    print("--------------------------------")
@@ -152,13 +202,17 @@ async def demo_json_extraction():
        schema={
            "name": "Blog Posts",
            "baseSelector": "div.article-list",
+            "baseFields": [
+                {"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
+                {"name": "category", "type": "attribute", "attribute": "data-category"}
+            ],
            "fields": [
                {
                    "name": "posts",
                    "selector": "article.post",
                    "type": "nested_list",
                    "baseFields": [
-                        {"name": "category", "type": "attribute", "attribute": "data-category"},
+                        {"name": "post_id", "type": "attribute", "attribute": "data-post-id"},
                        {"name": "author_id", "type": "attribute", "attribute": "data-author"}
                    ],
                    "fields": [
@@ -378,9 +432,9 @@ async def main():
    print("====================================")

    # Run all demos
-    # await demo_ssl_features()
-    # await demo_content_filtering()
-    # await demo_json_extraction()
+    await demo_ssl_features()
+    await demo_content_filtering()
+    await demo_json_extraction()
    await demo_input_formats()

 if __name__ == "__main__":
--- a/ssl_certificate.json
+++ b/ssl_certificate.json
@@ -0,0 +1,63 @@
+{
+  "subject": {
+    "C": "US",
+    "ST": "California",
+    "L": "Los Angeles",
+    "O": "Internet Corporation for Assigned Names and Numbers",
+    "CN": "www.example.org"
+  },
+  "issuer": {
+    "C": "US",
+    "O": "DigiCert Inc",
+    "CN": "DigiCert Global G2 TLS RSA SHA256 2020 CA1"
+  },
+  "version": 2,
+  "serial_number": "0x75bcef30689c8addf13e51af4afe187",
+  "not_before": "20240130000000Z",
+  "not_after": "20250301235959Z",
+  "fingerprint": "45463a42413a32363a44383a43313a43453a33373a37393a41433a37373a36333a30413a39303a46383a32313a36333a41333a44363a38393a32453a44363a41463a45453a34303a38363a37323a43463a31393a45423a41373a41333a3632",
+  "signature_algorithm": "sha256WithRSAEncryption",
+  "raw_cert": "MIIHbjCCBlagAwIBAgIQB1vO8waJyK3fE+Ua9K/hhzANBgkqhkiG9w0BAQsFADBZMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMTMwMQYDVQQDEypEaWdpQ2VydCBHbG9iYWwgRzIgVExTIFJTQSBTSEEyNTYgMjAyMCBDQTEwHhcNMjQwMTMwMDAwMDAwWhcNMjUwMzAxMjM1OTU5WjCBljELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWExFDASBgNVBAcTC0xvcyBBbmdlbGVzMUIwQAYDVQQKDDlJbnRlcm5ldMKgQ29ycG9yYXRpb27CoGZvcsKgQXNzaWduZWTCoE5hbWVzwqBhbmTCoE51bWJlcnMxGDAWBgNVBAMTD3d3dy5leGFtcGxlLm9yZzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAIaFD7sO+cpf2fXgCjIsM9mqDgcpqC8IrXi9wga/9y0rpqcnPVOmTMNLsid3INbBVEm4CNr5cKlh9rJJnWlX2vttJDRyLkfwBD+dsVvivGYxWTLmqX6/1LDUZPVrynv/cltemtg/1Aay88jcj2ZaRoRmqBgVeacIzgU8+zmJ7236TnFSe7fkoKSclsBhPaQKcE3Djs1uszJs8sdECQTdoFX9I6UgeLKFXtg7rRf/hcW5dI0zubhXbrW8aWXbCzySVZn0c7RkJMpnTCiZzNxnPXnHFpwr5quqqjVyN/aBKkjoP04Zmr+eRqoyk/+lslq0sS8eaYSSHbC5ja/yMWyVhvMCAwEAAaOCA/IwggPuMB8GA1UdIwQYMBaAFHSFgMBmx9833s+9KTeqAx2+7c0XMB0GA1UdDgQWBBRM/tASTS4hz2v68vK4TEkCHTGRijCBgQYDVR0RBHoweIIPd3d3LmV4YW1wbGUub3JnggtleGFtcGxlLm5ldIILZXhhbXBsZS5lZHWCC2V4YW1wbGUuY29tggtleGFtcGxlLm9yZ4IPd3d3LmV4YW1wbGUuY29tgg93d3cuZXhhbXBsZS5lZHWCD3d3dy5leGFtcGxlLm5ldDA+BgNVHSAENzA1MDMGBmeBDAECAjApMCcGCCsGAQUFBwIBFhtodHRwOi8vd3d3LmRpZ2ljZXJ0LmNvbS9DUFMwDgYDVR0PAQH/BAQDAgWgMB0GA1UdJQQWMBQGCCsGAQUFBwMBBggrBgEFBQcDAjCBnwYDVR0fBIGXMIGUMEigRqBEhkJodHRwOi8vY3JsMy5kaWdpY2VydC5jb20vRGlnaUNlcnRHbG9iYWxHMlRMU1JTQVNIQTI1NjIwMjBDQTEtMS5jcmwwSKBGoESGQmh0dHA6Ly9jcmw0LmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydEdsb2JhbEcyVExTUlNBU0hBMjU2MjAyMENBMS0xLmNybDCBhwYIKwYBBQUHAQEEezB5MCQGCCsGAQUFBzABhhhodHRwOi8vb2NzcC5kaWdpY2VydC5jb20wUQYIKwYBBQUHMAKGRWh0dHA6Ly9jYWNlcnRzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydEdsb2JhbEcyVExTUlNBU0hBMjU2MjAyMENBMS0xLmNydDAMBgNVHRMBAf8EAjAAMIIBfQYKKwYBBAHWeQIEAgSCAW0EggFpAWcAdABOdaMnXJoQwzhbbNTfP1LrHfDgjhuNacCx+mSxYpo53wAAAY1b0vxkAAAEAwBFMEMCH0BRCgxPbBBVxhcWZ26a8JCe83P1JZ6wmv56GsVcyMACIDgpMbEo5HJITTRPnoyT4mG8cLrWjEvhchUdEcWUuk1TAHYAfVkeEuF4KnscYWd8Xv340IdcFKBOlZ65Ay/ZDowuebgAAAGNW9L8MAAABAMARzBFAiBdv5Z3pZFbfgoM3tGpCTM3ZxBMQsxBRSdTS6d8d2NAcwIhALLoCT9mTMN9OyFz
IBV5MkXVLyuTf2OAzAOa7d8x2H6XAHcA5tIxY0B3jMEQQQbXcbnOwdJA9paEhvu6hzId/R43jlAAAAGNW9L8XwAABAMASDBGAiEA4Koh/VizdQU1tjZ2E2VGgWSXXkwnQmiYhmAeKcVLHeACIQD7JIGFsdGol7kss2pe4lYrCgPVc+iGZkuqnj26hqhr0TANBgkqhkiG9w0BAQsFAAOCAQEABOFuAj4N4yNG9OOWNQWTNSICC4Rd4nOG1HRP/Bsnrz7KrcPORtb6D+Jx+Q0amhO31QhIvVBYs14gY4Ypyj7MzHgm4VmPXcqLvEkxb2G9Qv9hYuEiNSQmm1fr5QAN/0AzbEbCM3cImLJ69kP5bUjfv/76KB57is8tYf9sh5ikLGKauxCM/zRIcGa3bXLDafk5S2g5Vr2hs230d/NGW1wZrE+zdGuMxfGJzJP+DAFviBfcQnFg4+1zMEKcqS87oniOyG+60RMM0MdejBD7AS43m9us96Gsun/4kufLQUTIFfnzxLutUV++3seshgefQOy5C/ayi8y1VTNmujPCxPCi6Q==",
+  "extensions": [
+    {
+      "name": "authorityKeyIdentifier",
+      "value": "74:85:80:C0:66:C7:DF:37:DE:CF:BD:29:37:AA:03:1D:BE:ED:CD:17"
+    },
+    {
+      "name": "subjectKeyIdentifier",
+      "value": "4C:FE:D0:12:4D:2E:21:CF:6B:FA:F2:F2:B8:4C:49:02:1D:31:91:8A"
+    },
+    {
+      "name": "subjectAltName",
+      "value": "DNS:www.example.org, DNS:example.net, DNS:example.edu, DNS:example.com, DNS:example.org, DNS:www.example.com, DNS:www.example.edu, DNS:www.example.net"
+    },
+    {
+      "name": "certificatePolicies",
+      "value": "Policy: 2.23.140.1.2.2\n  CPS: http://www.digicert.com/CPS"
+    },
+    {
+      "name": "keyUsage",
+      "value": "Digital Signature, Key Encipherment"
+    },
+    {
+      "name": "extendedKeyUsage",
+      "value": "TLS Web Server Authentication, TLS Web Client Authentication"
+    },
+    {
+      "name": "crlDistributionPoints",
+      "value": "Full Name:\n  URI:http://crl3.digicert.com/DigiCertGlobalG2TLSRSASHA2562020CA1-1.crl\nFull Name:\n  URI:http://crl4.digicert.com/DigiCertGlobalG2TLSRSASHA2562020CA1-1.crl"
+    },
+    {
+      "name": "authorityInfoAccess",
+      "value": "OCSP - URI:http://ocsp.digicert.com\nCA Issuers - URI:http://cacerts.digicert.com/DigiCertGlobalG2TLSRSASHA2562020CA1-1.crt"
+    },
+    {
+      "name": "basicConstraints",
+      "value": "CA:FALSE"
+    },
+    {
+      "name": "ct_precert_scts",
+      "value": "Signed Certificate Timestamp:\n    Version   : v1 (0x0)\n    Log ID    : 4E:75:A3:27:5C:9A:10:C3:38:5B:6C:D4:DF:3F:52:EB:\n                1D:F0:E0:8E:1B:8D:69:C0:B1:FA:64:B1:62:9A:39:DF\n    Timestamp : Jan 30 19:22:50.340 2024 GMT\n    Extensions: none\n    Signature : ecdsa-with-SHA256\n                30:43:02:1F:40:51:0A:0C:4F:6C:10:55:C6:17:16:67:\n                6E:9A:F0:90:9E:F3:73:F5:25:9E:B0:9A:FE:7A:1A:C5:\n                5C:C8:C0:02:20:38:29:31:B1:28:E4:72:48:4D:34:4F:\n                9E:8C:93:E2:61:BC:70:BA:D6:8C:4B:E1:72:15:1D:11:\n                C5:94:BA:4D:53\nSigned Certificate Timestamp:\n    Version   : v1 (0x0)\n    Log ID    : 7D:59:1E:12:E1:78:2A:7B:1C:61:67:7C:5E:FD:F8:D0:\n                87:5C:14:A0:4E:95:9E:B9:03:2F:D9:0E:8C:2E:79:B8\n    Timestamp : Jan 30 19:22:50.288 2024 GMT\n    Extensions: none\n    Signature : ecdsa-with-SHA256\n                30:45:02:20:5D:BF:96:77:A5:91:5B:7E:0A:0C:DE:D1:\n                A9:09:33:37:67:10:4C:42:CC:41:45:27:53:4B:A7:7C:\n                77:63:40:73:02:21:00:B2:E8:09:3F:66:4C:C3:7D:3B:\n                21:73:20:15:79:32:45:D5:2F:2B:93:7F:63:80:CC:03:\n                9A:ED:DF:31:D8:7E:97\nSigned Certificate Timestamp:\n    Version   : v1 (0x0)\n    Log ID    : E6:D2:31:63:40:77:8C:C1:10:41:06:D7:71:B9:CE:C1:\n                D2:40:F6:96:84:86:FB:BA:87:32:1D:FD:1E:37:8E:50\n    Timestamp : Jan 30 19:22:50.335 2024 GMT\n    Extensions: none\n    Signature : ecdsa-with-SHA256\n                30:46:02:21:00:E0:AA:21:FD:58:B3:75:05:35:B6:36:\n                76:13:65:46:81:64:97:5E:4C:27:42:68:98:86:60:1E:\n                29:C5:4B:1D:E0:02:21:00:FB:24:81:85:B1:D1:A8:97:\n                B9:2C:B3:6A:5E:E2:56:2B:0A:03:D5:73:E8:86:66:4B:\n                AA:9E:3D:BA:86:A8:6B:D1"
+    }
+  ]
+}