Apply Ruff Corrections

2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
--- a/docs/examples/amazon_product_extraction_using_hooks.py
+++ b/docs/examples/amazon_product_extraction_using_hooks.py
@@ -10,17 +10,17 @@ from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 import json
 from playwright.async_api import Page, BrowserContext

+
 async def extract_amazon_products():
    # Initialize browser config
    browser_config = BrowserConfig(
        # browser_type="chromium",
        headless=True
    )
-    
+
    # Initialize crawler config with JSON CSS extraction strategy
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
-
        extraction_strategy=JsonCssExtractionStrategy(
            schema={
                "name": "Amazon Product Search Results",
@@ -30,102 +30,105 @@ async def extract_amazon_products():
                        "name": "asin",
                        "selector": "",
                        "type": "attribute",
-                        "attribute": "data-asin"
-                    },
-                    {
-                        "name": "title",
-                        "selector": "h2 a span",
-                        "type": "text"
+                        "attribute": "data-asin",
                    },
+                    {"name": "title", "selector": "h2 a span", "type": "text"},
                    {
                        "name": "url",
                        "selector": "h2 a",
                        "type": "attribute",
-                        "attribute": "href"
+                        "attribute": "href",
                    },
                    {
                        "name": "image",
                        "selector": ".s-image",
                        "type": "attribute",
-                        "attribute": "src"
+                        "attribute": "src",
                    },
                    {
                        "name": "rating",
                        "selector": ".a-icon-star-small .a-icon-alt",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "reviews_count",
                        "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "price",
                        "selector": ".a-price .a-offscreen",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "original_price",
                        "selector": ".a-price.a-text-price .a-offscreen",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "sponsored",
                        "selector": ".puis-sponsored-label-text",
-                        "type": "exists"
+                        "type": "exists",
                    },
                    {
                        "name": "delivery_info",
                        "selector": "[data-cy='delivery-recipe'] .a-color-base",
                        "type": "text",
-                        "multiple": True
-                    }
-                ]
+                        "multiple": True,
+                    },
+                ],
            }
-        )
+        ),
    )

    url = "https://www.amazon.com/"
-    
-    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+
+    async def after_goto(
+        page: Page, context: BrowserContext, url: str, response: dict, **kwargs
+    ):
        """Hook called after navigating to each URL"""
        print(f"[HOOK] after_goto - Successfully loaded: {url}")
-        
+
        try:
            # Wait for search box to be available
-            search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
-            
+            search_box = await page.wait_for_selector(
+                "#twotabsearchtextbox", timeout=1000
+            )
+
            # Type the search query
-            await search_box.fill('Samsung Galaxy Tab')
-            
+            await search_box.fill("Samsung Galaxy Tab")
+
            # Get the search button and prepare for navigation
-            search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
-            
+            search_button = await page.wait_for_selector(
+                "#nav-search-submit-button", timeout=1000
+            )
+
            # Click with navigation waiting
            await search_button.click()
-            
+
            # Wait for search results to load
-            await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
+            await page.wait_for_selector(
+                '[data-component-type="s-search-result"]', timeout=10000
+            )
            print("[HOOK] Search completed and results loaded!")
-            
+
        except Exception as e:
            print(f"[HOOK] Error during search operation: {str(e)}")
-            
-        return page    
-    
+
+        return page
+
    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
-        
        crawler.crawler_strategy.set_hook("after_goto", after_goto)
-        
+
        # Extract the data
        result = await crawler.arun(url=url, config=crawler_config)
-        
+
        # Process and print the results
        if result and result.extracted_content:
            # Parse the JSON string into a list of products
            products = json.loads(result.extracted_content)
-            
+
            # Process each product in the list
            for product in products:
                print("\nProduct Details:")
@@ -136,10 +139,12 @@ async def extract_amazon_products():
                print(f"Rating: {product.get('rating')}")
                print(f"Reviews: {product.get('reviews_count')}")
                print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
-                if product.get('delivery_info'):
+                if product.get("delivery_info"):
                    print(f"Delivery: {' '.join(product['delivery_info'])}")
                print("-" * 80)

+
 if __name__ == "__main__":
    import asyncio
+
    asyncio.run(extract_amazon_products())