diff --git a/CHANGELOG.md b/CHANGELOG.md
index b654953f..3bea14df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+### [Added] 2025-01-20
+- New LLM-powered schema generation utility for JsonElementExtractionStrategy
+- Support for automatic CSS and XPath schema generation using OpenAI or Ollama
+- Comprehensive documentation and examples for schema generation
+- New prompt templates optimized for HTML schema analysis
+
 # Changelog
 
 All notable changes to Crawl4AI will be documented in this file.
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 1e31a5cd..b2b24751 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -5,7 +5,7 @@ import json
 import time
 import os
-from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
+from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
 from .config import (
     DEFAULT_PROVIDER, PROVIDER_MODELS, CHUNK_TOKEN_THRESHOLD,
@@ -1060,6 +1060,72 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
         """Get attribute value from element"""
         pass
 
+    @staticmethod
+    def generate_schema(
+        html: str,
+        schema_type: str = "CSS",  # or "XPATH" (case-insensitive)
+        query: str = None,
+        provider: str = "gpt-4o",
+        api_token: str = None,
+        **kwargs
+    ) -> dict:
+        """
+        Generate an extraction schema from HTML content and an optional query.
+
+        Args:
+            html (str): The HTML content to analyze.
+            schema_type (str): "CSS" or "XPATH" (case-insensitive); selects both
+                the prompt template and the selector dialect of the result.
+            query (str, optional): Natural language description of what data to extract.
+            provider (str): LLM provider to use.
+            api_token (str, optional): API token for the LLM provider. Falls back to
+                the OPENAI_API_KEY environment variable when omitted.
+            **kwargs: Additional args passed to perform_completion_with_backoff.
+
+        Returns:
+            dict: Generated schema following the JsonElementExtractionStrategy format.
+
+        Raises:
+            ValueError: If schema_type is neither "CSS" nor "XPATH".
+            Exception: If the LLM call fails or does not return valid JSON.
+        """
+        # Imported lazily so module import stays cheap and avoids cycles.
+        from .prompts import JSON_SCHEMA_BUILDER
+        from .utils import perform_completion_with_backoff
+
+        # Read the environment at call time, not at import time, so a token
+        # exported after the module was imported is still picked up.
+        if api_token is None:
+            api_token = os.getenv("OPENAI_API_KEY")
+
+        # Normalize so "css"/"xpath" work too; an unrecognized value used to
+        # silently fall through to the XPath template.
+        normalized_type = schema_type.upper()
+        if normalized_type not in ("CSS", "XPATH"):
+            raise ValueError(f"schema_type must be 'CSS' or 'XPATH', got {schema_type!r}")
+        prompt_template = JSON_SCHEMA_BUILDER if normalized_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH
+
+        # Build the prompt
+        system_message = {
+            "role": "system",
+            "content": "You are a specialized HTML schema generator. Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else."
+        }
+
+        user_message = {
+            "role": "user",
+            "content": f"""
+            Instructions:
+            {prompt_template}
+
+            HTML to analyze:
+            ```html
+            {html}
+            ```
+
+            {"Extract the following data: " + query if query else "Please analyze the HTML structure and create the most appropriate schema for data extraction."}
+            """
+        }
+
+        try:
+            # Call LLM with backoff handling
+            response = perform_completion_with_backoff(
+                provider=provider,
+                prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
+                json_response=True,
+                api_token=api_token,
+                **kwargs
+            )
+
+            # Extract and return schema
+            return json.loads(response.choices[0].message.content)
+
+        except Exception as e:
+            # Chain the original exception so the real cause stays debuggable.
+            raise Exception(f"Failed to generate schema: {str(e)}") from e
+
 
 class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
     """
@@ -1171,3 +1237,4 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
 
     def _get_element_attribute(self, element, attribute: str):
         return element.get(attribute)
+
diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py
index abddd64f..be5e0310 100644
---
a/crawl4ai/prompts.py +++ b/crawl4ai/prompts.py @@ -256,4 +256,754 @@ Wrap your response in tags. Use proper markdown throughout. [Your markdown content here] -Begin filtering now.""" \ No newline at end of file +Begin filtering now.""" + +JSON_SCHEMA_BUILDER= """ +# HTML Schema Generation Instructions +You are a specialized model designed to analyze HTML patterns and generate extraction schemas. Your primary job is to create structured JSON schemas that can be used to extract data from HTML in a consistent and reliable way. When presented with HTML content, you must analyze its structure and generate a schema that captures all relevant data points. + +## Your Core Responsibilities: +1. Analyze HTML structure to identify repeating patterns and important data points +2. Generate valid JSON schemas following the specified format +3. Create appropriate selectors that will work reliably for data extraction +4. Name fields meaningfully based on their content and purpose +5. Handle both specific user requests and autonomous pattern detection + +## Available Schema Types You Can Generate: + + +1. Basic Single-Level Schema + - Use for simple, flat data structures + - Example: Product cards, user profiles + - Direct field extractions + +2. Nested Object Schema + - Use for hierarchical data + - Example: Articles with author details + - Contains objects within objects + +3. List Schema + - Use for repeating elements + - Example: Comment sections, product lists + - Handles arrays of similar items + +4. Complex Nested Lists + - Use for multi-level data + - Example: Categories with subcategories + - Multiple levels of nesting + +5. 
Transformation Schema + - Use for data requiring processing + - Supports regex and text transformations + - Special attribute handling + + + +Your output must always be a JSON object with this structure: +{ + "name": "Descriptive name of the pattern", + "baseSelector": "CSS selector for the repeating element", + "fields": [ + { + "name": "field_name", + "selector": "CSS selector", + "type": "text|attribute|nested|list|regex", + "attribute": "attribute_name", // Optional + "transform": "transformation_type", // Optional + "pattern": "regex_pattern", // Optional + "fields": [] // For nested/list types + } + ] +} + + + +Available field types: +- text: Direct text extraction +- attribute: HTML attribute extraction +- nested: Object containing other fields +- list: Array of similar items +- regex: Pattern-based extraction + + + +1. When given a specific query: + - Focus on extracting requested data points + - Use most specific selectors possible + - Include all fields mentioned in the query + +2. When no query is provided: + - Identify main content areas + - Extract all meaningful data points + - Use semantic structure to determine importance + - Include prices, dates, titles, and other common data types + +3. Always: + - Use reliable CSS selectors + - Handle dynamic class names appropriately + - Create descriptive field names + - Follow consistent naming conventions + + + +1. Basic Product Card Example: + +
+```html
+<div class="product-card" data-cat-id="electronics" data-subcat-id="laptops">
+  <h2 class="product-title">Gaming Laptop</h2>
+  <span class="price">$999.99</span>
+  <img src="laptop.jpg" alt="Gaming Laptop">
+</div>
+```
+
+Generated Schema:
+{
+  "name": "Product Cards",
+  "baseSelector": ".product-card",
+  "baseFields": [
+    {"name": "data_cat_id", "type": "attribute", "attribute": "data-cat-id"},
+    {"name": "data_subcat_id", "type": "attribute", "attribute": "data-subcat-id"}
+  ],
+  "fields": [
+    {"name": "title", "selector": ".product-title", "type": "text"},
+    {"name": "price", "selector": ".price", "type": "text"},
+    {"name": "image_url", "selector": "img", "type": "attribute", "attribute": "src"}
+  ]
+}
+
+2. Article with Author Details Example:
+```html
+<article>
+  <h1>The Future of AI</h1>
+  <div class="author-info">
+    <span class="author-name">Dr. Smith</span>
+    <img src="avatar.jpg" alt="Dr. Smith">
+  </div>
+</article>
+```
+
+Generated Schema:
+{
+  "name": "Article Details",
+  "baseSelector": "article",
+  "fields": [
+    {"name": "title", "selector": "h1", "type": "text"},
+    {
+      "name": "author",
+      "type": "nested",
+      "selector": ".author-info",
+      "fields": [
+        {"name": "name", "selector": ".author-name", "type": "text"},
+        {"name": "avatar", "selector": "img", "type": "attribute", "attribute": "src"}
+      ]
+    }
+  ]
+}
+
+3. Comments Section Example:
+```html
+<div class="comments-container" data-user-id="user-123">
+  <div class="comment">
+    <div class="user-name">John123</div>
+    <p class="comment-text">Great article!</p>
+  </div>
+  <div class="comment">
+    <div class="user-name">Alice456</div>
+    <p class="comment-text">Thanks for sharing.</p>
+  </div>
+</div>
+```
+
+Generated Schema:
+{
+  "name": "Comment Section",
+  "baseSelector": ".comments-container",
+  "baseFields": [
+    {"name": "data_user_id", "type": "attribute", "attribute": "data-user-id"}
+  ],
+  "fields": [
+    {
+      "name": "comments",
+      "type": "list",
+      "selector": ".comment",
+      "fields": [
+        {"name": "user", "selector": ".user-name", "type": "text"},
+        {"name": "content", "selector": ".comment-text", "type": "text"}
+      ]
+    }
+  ]
+}
+
+4. E-commerce Categories Example:
+```html
+<div class="category-section" data-category="electronics">
+  <h2>Electronics</h2>
+  <div class="subcategory">
+    <h3>Laptops</h3>
+    <div class="product">
+      <span class="product-name">MacBook Pro</span>
+      <span class="price">$1299</span>
+    </div>
+    <div class="product">
+      <span class="product-name">Dell XPS</span>
+      <span class="price">$999</span>
+    </div>
+  </div>
+</div>
+```
+
+Generated Schema:
+{
+  "name": "E-commerce Categories",
+  "baseSelector": ".category-section",
+  "baseFields": [
+    {"name": "data_category", "type": "attribute", "attribute": "data-category"}
+  ],
+  "fields": [
+    {"name": "category_name", "selector": "h2", "type": "text"},
+    {
+      "name": "subcategories",
+      "type": "nested_list",
+      "selector": ".subcategory",
+      "fields": [
+        {"name": "name", "selector": "h3", "type": "text"},
+        {
+          "name": "products",
+          "type": "list",
+          "selector": ".product",
+          "fields": [
+            {"name": "name", "selector": ".product-name", "type": "text"},
+            {"name": "price", "selector": ".price", "type": "text"}
+          ]
+        }
+      ]
+    }
+  ]
+}
+
+5. Job Listings with Transformations Example:
+```html
+<div class="job-post">
+  <h3 class="job-title">Senior Developer</h3>
+  <span class="salary-text">Salary: $120,000/year</span>
+  <span class="location">  New York, NY  </span>
+</div>
+```
+
+Generated Schema:
+{
+  "name": "Job Listings",
+  "baseSelector": ".job-post",
+  "fields": [
+    {"name": "title", "selector": ".job-title", "type": "text", "transform": "uppercase"},
+    {"name": "salary", "selector": ".salary-text", "type": "regex", "pattern": "\\$([\\d,]+)"},
+    {"name": "location", "selector": ".location", "type": "text", "transform": "strip"}
+  ]
+}
+
+6. Skyscanner Place Card Example:
+```html
+<!-- reconstructed sample: hashed class suffixes are illustrative placeholders -->
+<div class="PlaceCard_descriptionContainer__sK5Tq" data-testid="description-container">
+  <div class="PlaceCard_nameContainer__F2h7x">
+    <div class="PlaceCard_nameContent__jq2XQ">
+      <span class="BpkText_bpk-text--heading-4__a1b2c">Doha</span>
+    </div>
+    <span class="PlaceCard_subName__x9Y3z">Qatar</span>
+  </div>
+  <span class="PlaceCard_advertLabel__q8R4w">Sunny days and the warmest welcome awaits</span>
+  <a data-testid="flights-link" href="/flights-to/doha/">
+    <span class="BpkText_bpk-text--heading-5__a1b2c">₹17,559</span>
+  </a>
+</div>
+```
+
+Generated Schema:
+{
+  "name": "Skyscanner Place Cards",
+  "baseSelector": "div[class^='PlaceCard_descriptionContainer__']",
+  "baseFields": [
+    {"name": "data_testid", "type": "attribute", "attribute": "data-testid"}
+  ],
+  "fields": [
+    {"name": "city_name", "selector": "div[class^='PlaceCard_nameContent__'] .BpkText_bpk-text--heading-4__", "type": "text"},
+    {"name": "country_name", "selector": "span[class*='PlaceCard_subName__']", "type": "text"},
+    {"name": "description", "selector": "span[class*='PlaceCard_advertLabel__']", "type": "text"},
+    {"name": "flight_price", "selector": "a[data-testid='flights-link'] .BpkText_bpk-text--heading-5__", "type": "text"},
+    {"name": "flight_url", "selector": "a[data-testid='flights-link']", "type": "attribute", "attribute": "href"}
+  ]
+}
+
+ + + +Your output must: +1. Be valid JSON only +2. Include no explanatory text +3. Follow the exact schema structure provided +4. Use appropriate field types +5. Include all required fields +6. Use valid CSS selectors + + +""" + +JSON_SCHEMA_BUILDER_XPATH = """ +# HTML Schema Generation Instructions +You are a specialized model designed to analyze HTML patterns and generate extraction schemas. Your primary job is to create structured JSON schemas that can be used to extract data from HTML in a consistent and reliable way. When presented with HTML content, you must analyze its structure and generate a schema that captures all relevant data points. + +## Your Core Responsibilities: +1. Analyze HTML structure to identify repeating patterns and important data points +2. Generate valid JSON schemas following the specified format +3. Create appropriate XPath selectors that will work reliably for data extraction +4. Name fields meaningfully based on their content and purpose +5. Handle both specific user requests and autonomous pattern detection + +## Available Schema Types You Can Generate: + + +1. Basic Single-Level Schema + - Use for simple, flat data structures + - Example: Product cards, user profiles + - Direct field extractions + +2. Nested Object Schema + - Use for hierarchical data + - Example: Articles with author details + - Contains objects within objects + +3. List Schema + - Use for repeating elements + - Example: Comment sections, product lists + - Handles arrays of similar items + +4. Complex Nested Lists + - Use for multi-level data + - Example: Categories with subcategories + - Multiple levels of nesting + +5. 
Transformation Schema + - Use for data requiring processing + - Supports regex and text transformations + - Special attribute handling + + + +Your output must always be a JSON object with this structure: +{ + "name": "Descriptive name of the pattern", + "baseSelector": "XPath selector for the repeating element", + "fields": [ + { + "name": "field_name", + "selector": "XPath selector", + "type": "text|attribute|nested|list|regex", + "attribute": "attribute_name", // Optional + "transform": "transformation_type", // Optional + "pattern": "regex_pattern", // Optional + "fields": [] // For nested/list types + } + ] +} + + + +Available field types: +- text: Direct text extraction +- attribute: HTML attribute extraction +- nested: Object containing other fields +- list: Array of similar items +- regex: Pattern-based extraction + + + +1. When given a specific query: + - Focus on extracting requested data points + - Use most specific selectors possible + - Include all fields mentioned in the query + +2. When no query is provided: + - Identify main content areas + - Extract all meaningful data points + - Use semantic structure to determine importance + - Include prices, dates, titles, and other common data types + +3. Always: + - Use reliable XPath selectors + - Handle dynamic element IDs appropriately + - Create descriptive field names + - Follow consistent naming conventions + + + +1. Basic Product Card Example: + +
+```html
+<div class="product-card" data-cat-id="electronics" data-subcat-id="laptops">
+  <h2 class="product-title">Gaming Laptop</h2>
+  <span class="price">$999.99</span>
+  <img src="laptop.jpg" alt="Gaming Laptop">
+</div>
+```
+
+Generated Schema:
+{
+  "name": "Product Cards",
+  "baseSelector": "//div[@class='product-card']",
+  "baseFields": [
+    {"name": "data_cat_id", "type": "attribute", "attribute": "data-cat-id"},
+    {"name": "data_subcat_id", "type": "attribute", "attribute": "data-subcat-id"}
+  ],
+  "fields": [
+    {"name": "title", "selector": ".//h2[@class='product-title']", "type": "text"},
+    {"name": "price", "selector": ".//span[@class='price']", "type": "text"},
+    {"name": "image_url", "selector": ".//img", "type": "attribute", "attribute": "src"}
+  ]
+}
+
+2. Article with Author Details Example:
+```html
+<article>
+  <h1>The Future of AI</h1>
+  <div class="author-info">
+    <span class="author-name">Dr. Smith</span>
+    <img src="avatar.jpg" alt="Dr. Smith">
+  </div>
+</article>
+```
+
+Generated Schema:
+{
+  "name": "Article Details",
+  "baseSelector": "//article",
+  "fields": [
+    {"name": "title", "selector": ".//h1", "type": "text"},
+    {
+      "name": "author",
+      "type": "nested",
+      "selector": ".//div[@class='author-info']",
+      "fields": [
+        {"name": "name", "selector": ".//span[@class='author-name']", "type": "text"},
+        {"name": "avatar", "selector": ".//img", "type": "attribute", "attribute": "src"}
+      ]
+    }
+  ]
+}
+
+3. Comments Section Example:
+```html
+<div class="comments-container">
+  <div class="comment" data-user-id="user-123">
+    <div class="user-name">John123</div>
+    <p class="comment-text">Great article!</p>
+  </div>
+  <div class="comment" data-user-id="user-456">
+    <div class="user-name">Alice456</div>
+    <p class="comment-text">Thanks for sharing.</p>
+  </div>
+</div>
+```
+
+Generated Schema:
+{
+  "name": "Comment Section",
+  "baseSelector": "//div[@class='comments-container']",
+  "fields": [
+    {
+      "name": "comments",
+      "type": "list",
+      "selector": ".//div[@class='comment']",
+      "baseFields": [
+        {"name": "data_user_id", "type": "attribute", "attribute": "data-user-id"}
+      ],
+      "fields": [
+        {"name": "user", "selector": ".//div[@class='user-name']", "type": "text"},
+        {"name": "content", "selector": ".//p[@class='comment-text']", "type": "text"}
+      ]
+    }
+  ]
+}
+
+4. E-commerce Categories Example:
+```html
+<div class="category-section" data-category="electronics">
+  <h2>Electronics</h2>
+  <div class="subcategory">
+    <h3>Laptops</h3>
+    <div class="product">
+      <span class="product-name">MacBook Pro</span>
+      <span class="price">$1299</span>
+    </div>
+    <div class="product">
+      <span class="product-name">Dell XPS</span>
+      <span class="price">$999</span>
+    </div>
+  </div>
+</div>
+```
+
+Generated Schema:
+{
+  "name": "E-commerce Categories",
+  "baseSelector": "//div[@class='category-section']",
+  "baseFields": [
+    {"name": "data_category", "type": "attribute", "attribute": "data-category"}
+  ],
+  "fields": [
+    {"name": "category_name", "selector": ".//h2", "type": "text"},
+    {
+      "name": "subcategories",
+      "type": "nested_list",
+      "selector": ".//div[@class='subcategory']",
+      "fields": [
+        {"name": "name", "selector": ".//h3", "type": "text"},
+        {
+          "name": "products",
+          "type": "list",
+          "selector": ".//div[@class='product']",
+          "fields": [
+            {"name": "name", "selector": ".//span[@class='product-name']", "type": "text"},
+            {"name": "price", "selector": ".//span[@class='price']", "type": "text"}
+          ]
+        }
+      ]
+    }
+  ]
+}
+
+5. Job Listings with Transformations Example:
+```html
+<div class="job-post">
+  <h3 class="job-title">Senior Developer</h3>
+  <span class="salary-text">Salary: $120,000/year</span>
+  <span class="location">  New York, NY  </span>
+</div>
+```
+
+Generated Schema:
+{
+  "name": "Job Listings",
+  "baseSelector": "//div[@class='job-post']",
+  "fields": [
+    {"name": "title", "selector": ".//h3[@class='job-title']", "type": "text", "transform": "uppercase"},
+    {"name": "salary", "selector": ".//span[@class='salary-text']", "type": "regex", "pattern": "\\$([\\d,]+)"},
+    {"name": "location", "selector": ".//span[@class='location']", "type": "text", "transform": "strip"}
+  ]
+}
+
+6. Skyscanner Place Card Example:
+```html
+<!-- reconstructed sample: hashed class suffixes are illustrative placeholders -->
+<div class="PlaceCard_descriptionContainer__sK5Tq" data-testid="description-container">
+  <div class="PlaceCard_nameContainer__F2h7x">
+    <div class="PlaceCard_nameContent__jq2XQ">
+      <span class="BpkText_bpk-text--heading-4__a1b2c">Doha</span>
+    </div>
+    <span class="PlaceCard_subName__x9Y3z">Qatar</span>
+  </div>
+  <span class="PlaceCard_advertLabel__q8R4w">Sunny days and the warmest welcome awaits</span>
+  <a data-testid="flights-link" href="/flights-to/doha/">
+    <span class="BpkText_bpk-text--heading-5__a1b2c">₹17,559</span>
+  </a>
+</div>
+```
+
+Generated Schema:
+{
+  "name": "Skyscanner Place Cards",
+  "baseSelector": "//div[contains(@class, 'PlaceCard_descriptionContainer__')]",
+  "baseFields": [
+    {"name": "data_testid", "type": "attribute", "attribute": "data-testid"}
+  ],
+  "fields": [
+    {"name": "city_name", "selector": ".//div[contains(@class, 'PlaceCard_nameContent__')]//span[contains(@class, 'BpkText_bpk-text--heading-4__')]", "type": "text"},
+    {"name": "country_name", "selector": ".//span[contains(@class, 'PlaceCard_subName__')]", "type": "text"},
+    {"name": "description", "selector": ".//span[contains(@class, 'PlaceCard_advertLabel__')]", "type": "text"},
+    {"name": "flight_price", "selector": ".//a[@data-testid='flights-link']//span[contains(@class, 'BpkText_bpk-text--heading-5__')]", "type": "text"},
+    {"name": "flight_url", "selector": ".//a[@data-testid='flights-link']", "type": "attribute", "attribute": "href"}
+  ]
+}
+
+ + +Your output must: +1. Be valid JSON only +2. Include no explanatory text +3. Follow the exact schema structure provided +4. Use appropriate field types +5. Include all required fields +6. Use valid XPath selectors + +""" \ No newline at end of file diff --git a/docs/md_v2/core/quickstart.md b/docs/md_v2/core/quickstart.md index 0fa0936b..04614533 100644 --- a/docs/md_v2/core/quickstart.md +++ b/docs/md_v2/core/quickstart.md @@ -124,6 +124,36 @@ async with AsyncWebCrawler() as crawler: Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. Below is a minimal CSS-based example: +> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions: + +```python +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +# Generate a schema (one-time cost) +html = "

Gaming Laptop

$999.99
"

# Using OpenAI (requires API token)
schema = JsonCssExtractionStrategy.generate_schema(
    html,
    provider="openai/gpt-4o",      # LLM provider
    api_token="your-openai-token"  # Required for OpenAI
)

# Or using Ollama (open source, no token needed)
schema = JsonCssExtractionStrategy.generate_schema(
    html,
    provider="ollama/llama3.3",  # Open source alternative
    api_token=None               # Not needed for Ollama
)

# Use the schema for fast, repeated extractions
strategy = JsonCssExtractionStrategy(schema)
```

For a complete guide on schema generation and advanced usage, see [No-LLM Extraction Strategies](../extraction/no-llm-strategies.md).

Here's a basic extraction example:

```python
import asyncio
import json
diff --git a/docs/md_v2/extraction/no-llm-strategies.md b/docs/md_v2/extraction/no-llm-strategies.md
index 599d73e9..97002dad 100644
--- a/docs/md_v2/extraction/no-llm-strategies.md
+++ b/docs/md_v2/extraction/no-llm-strategies.md
@@ -401,7 +401,92 @@ Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post o
 
 ---
 
-## 8. Conclusion
+## 8. Schema Generation Utility
+
+While manually crafting schemas is powerful and precise, Crawl4AI now offers a convenient utility to **automatically generate** extraction schemas using LLM. This is particularly useful when:
+
+1. You're dealing with a new website structure and want a quick starting point
+2. You need to extract complex nested data structures
+3. You want to avoid the learning curve of CSS/XPath selector syntax
+
+### Using the Schema Generator
+
+The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+
+# Sample HTML with product information
+html = """
+<div class="product-card">
+  <h2 class="title">Gaming Laptop</h2>
+  <div class="price">$999.99</div>
+  <div class="features">
+    <span>High-performance processor</span>
+    <span>16GB RAM</span>
+  </div>
+</div>
+"""
+
+# Option 1: Using OpenAI (requires API token)
+css_schema = JsonCssExtractionStrategy.generate_schema(
+    html,
+    schema_type="CSS",  # This is the default
+    provider="openai/gpt-4o",
+    api_token="your-openai-token"  # Required for OpenAI
+)
+
+# Option 2: Using Ollama (open source, no token needed)
+xpath_schema = JsonXPathExtractionStrategy.generate_schema(
+    html,
+    schema_type="XPATH",
+    provider="ollama/llama3.3",  # Open source alternative
+    api_token=None  # Not needed for Ollama
+)
+
+# Use the generated schema for fast, repeated extractions
+strategy = JsonCssExtractionStrategy(css_schema)
+```
+
+### LLM Provider Options
+
+1. **OpenAI GPT-4 (`openai/gpt-4o`)**
+   - Default provider
+   - Requires an API token
+   - Generally provides more accurate schemas
+   - Set via environment variable: `OPENAI_API_KEY`
+
+2. **Ollama (`ollama/llama3.3`)**
+   - Open source alternative
+   - No API token required
+   - Self-hosted option
+   - Good for development and testing
+
+### Benefits of Schema Generation
+
+1. **One-Time Cost**: While schema generation uses LLM, it's a one-time cost. The generated schema can be reused for unlimited extractions without further LLM calls.
+2. **Smart Pattern Recognition**: The LLM analyzes the HTML structure and identifies common patterns, often producing more robust selectors than manual attempts.
+3. **Automatic Nesting**: Complex nested structures are automatically detected and properly represented in the schema.
+4. **Learning Tool**: The generated schemas serve as excellent examples for learning how to write your own schemas.
+
+### Best Practices
+
+1. **Review Generated Schemas**: While the generator is smart, always review and test the generated schema before using it in production.
+2. **Provide Representative HTML**: The better your sample HTML represents the overall structure, the more accurate the generated schema will be.
+3.
**Consider Both CSS and XPath**: Try both schema types and choose the one that works best for your specific case. +4. **Cache Generated Schemas**: Since generation uses LLM, save successful schemas for reuse. +5. **API Token Security**: Never hardcode API tokens. Use environment variables or secure configuration management. +6. **Choose Provider Wisely**: + - Use OpenAI for production-quality schemas + - Use Ollama for development, testing, or when you need a self-hosted solution + +That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines! + +--- + +## 9. Conclusion With **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy**), you can build powerful, **LLM-free** pipelines that: diff --git a/tests/20241401/test_schema_builder.py b/tests/20241401/test_schema_builder.py new file mode 100644 index 00000000..431fb001 --- /dev/null +++ b/tests/20241401/test_schema_builder.py @@ -0,0 +1,111 @@ +# https://claude.ai/chat/c4bbe93d-fb54-44ce-92af-76b4c8086c6b +# https://claude.ai/chat/c24a768c-d8b2-478a-acc7-d76d42a308da +import os, sys + +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy +import json + +# Test HTML - A complex job board with companies, departments, and positions +test_html = """ +
+<!-- reconstructed sample markup: attribute values are plausible placeholders -->
+<div class="job-board">
+  <div class="company-section" data-company-id="google">
+    <div class="company-header">
+      <h1 class="company-name">Google</h1>
+      <div class="company-info">
+        <span class="employee-count">10,000+ employees</span>
+        <span class="industry">Technology</span>
+        <a class="company-link" href="https://careers.google.com">Careers Page</a>
+      </div>
+    </div>
+    <div class="departments">
+      <div class="department" data-dept-id="eng">
+        <h2 class="department-name">Engineering</h2>
+        <div class="positions">
+          <div class="position" data-job-id="swe-001">
+            <h3 class="job-title">Senior Software Engineer</h3>
+            <span class="salary-range">$150,000 - $250,000</span>
+            <div class="job-meta">
+              <span class="location">Mountain View, CA</span>
+              <span class="job-type">Full-time</span>
+              <span class="experience">5+ years</span>
+            </div>
+            <div class="skills">
+              <span class="skill">Python</span>
+              <span class="skill">Kubernetes</span>
+              <span class="skill">Machine Learning</span>
+            </div>
+            <p class="job-description">Join our core engineering team...</p>
+            <span class="posted-date">Posted: 2024-03-15</span>
+            <a class="apply-link" href="/apply/swe-001">Apply Now</a>
+          </div>
+        </div>
+      </div>
+      <div class="department" data-dept-id="mkt">
+        <h2 class="department-name">Marketing</h2>
+        <div class="positions">
+          <div class="position" data-job-id="gmm-001">
+            <h3 class="job-title">Growth Marketing Manager</h3>
+            <span class="salary-range">$120,000 - $180,000</span>
+            <div class="job-meta">
+              <span class="location">New York, NY</span>
+              <span class="job-type">Full-time</span>
+              <span class="experience">3+ years</span>
+            </div>
+            <div class="skills">
+              <span class="skill">SEO</span>
+              <span class="skill">Analytics</span>
+              <span class="skill">Content Strategy</span>
+            </div>
+            <p class="job-description">Drive our growth initiatives...</p>
+            <span class="posted-date">Posted: 2024-03-14</span>
+            <a class="apply-link" href="/apply/gmm-001">Apply Now</a>
+          </div>
+        </div>
+      </div>
+    </div>
+  </div>
+</div>
+""" + +# Test cases +def test_schema_generation(): + # Test 1: No query (should extract everything) + print("\nTest 1: No Query (Full Schema)") + schema1 = JsonCssExtractionStrategy.generate_schema(test_html) + print(json.dumps(schema1, indent=2)) + + # Test 2: Query for just basic job info + print("\nTest 2: Basic Job Info Query") + query2 = "I only need job titles, salaries, and locations" + schema2 = JsonCssExtractionStrategy.generate_schema(test_html, query2) + print(json.dumps(schema2, indent=2)) + + # Test 3: Query for company and department structure + print("\nTest 3: Organizational Structure Query") + query3 = "Extract company details and department names, without position details" + schema3 = JsonCssExtractionStrategy.generate_schema(test_html, query3) + print(json.dumps(schema3, indent=2)) + + # Test 4: Query for specific skills tracking + print("\nTest 4: Skills Analysis Query") + query4 = "I want to analyze required skills across all positions" + schema4 = JsonCssExtractionStrategy.generate_schema(test_html, query4) + print(json.dumps(schema4, indent=2)) + +if __name__ == "__main__": + test_schema_generation() \ No newline at end of file