feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality:
- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
This commit is contained in:
112
tests/general/test_schema_builder.py
Normal file
112
tests/general/test_schema_builder.py
Normal file
@@ -0,0 +1,112 @@
|
||||
# https://claude.ai/chat/c4bbe93d-fb54-44ce-92af-76b4c8086c6b
|
||||
# https://claude.ai/chat/c24a768c-d8b2-478a-acc7-d76d42a308da
|
||||
import os, sys
|
||||
|
||||
# Append the parent of this file's directory to sys.path so modules located
# there can be imported when this file is run directly as a script.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

# Resolved (symlink-free) path of the directory containing this file.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||
from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy
|
||||
import json
|
||||
|
||||
# Test HTML - A complex job board with companies, departments, and positions
|
||||
# Fixture shape: one company block (header + metadata) containing two
# departments, each with a single position card (title, salary, meta,
# skills, description and application info).
test_html = """
<div class="company-listings">
    <div class="company" data-company-id="123">
        <div class="company-header">
            <img class="company-logo" src="google.png" alt="Google">
            <h1 class="company-name">Google</h1>
            <div class="company-meta">
                <span class="company-size">10,000+ employees</span>
                <span class="company-industry">Technology</span>
                <a href="https://google.careers" class="careers-link">Careers Page</a>
            </div>
        </div>

        <div class="departments">
            <div class="department">
                <h2 class="department-name">Engineering</h2>
                <div class="positions">
                    <div class="position-card" data-position-id="eng-1">
                        <h3 class="position-title">Senior Software Engineer</h3>
                        <span class="salary-range">$150,000 - $250,000</span>
                        <div class="position-meta">
                            <span class="location">Mountain View, CA</span>
                            <span class="job-type">Full-time</span>
                            <span class="experience">5+ years</span>
                        </div>
                        <div class="skills-required">
                            <span class="skill">Python</span>
                            <span class="skill">Kubernetes</span>
                            <span class="skill">Machine Learning</span>
                        </div>
                        <p class="position-description">Join our core engineering team...</p>
                        <div class="application-info">
                            <span class="posting-date">Posted: 2024-03-15</span>
                            <button class="apply-btn" data-req-id="REQ12345">Apply Now</button>
                        </div>
                    </div>
                    <!-- More positions -->
                </div>
            </div>

            <div class="department">
                <h2 class="department-name">Marketing</h2>
                <div class="positions">
                    <div class="position-card" data-position-id="mkt-1">
                        <h3 class="position-title">Growth Marketing Manager</h3>
                        <span class="salary-range">$120,000 - $180,000</span>
                        <div class="position-meta">
                            <span class="location">New York, NY</span>
                            <span class="job-type">Full-time</span>
                            <span class="experience">3+ years</span>
                        </div>
                        <div class="skills-required">
                            <span class="skill">SEO</span>
                            <span class="skill">Analytics</span>
                            <span class="skill">Content Strategy</span>
                        </div>
                        <p class="position-description">Drive our growth initiatives...</p>
                        <div class="application-info">
                            <span class="posting-date">Posted: 2024-03-14</span>
                            <button class="apply-btn" data-req-id="REQ12346">Apply Now</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
"""
|
||||
|
||||
# Test cases
|
||||
def test_schema_generation():
    """Generate extraction schemas for ``test_html`` and print each one.

    Runs ``JsonCssExtractionStrategy.generate_schema`` four times against the
    module-level ``test_html`` fixture: once with no query and three times
    with a natural-language query narrowing what should be extracted. Each
    returned schema is printed as indented JSON under its heading.

    Nothing is asserted; the results are only printed.
    """
    # Each case is (heading, extra positional args after ``test_html``).
    # The first case deliberately passes no query argument at all.
    cases = [
        # Test 1: No query (should extract everything)
        ("\nTest 1: No Query (Full Schema)", ()),
        # Test 2: Query for just basic job info
        (
            "\nTest 2: Basic Job Info Query",
            ("I only need job titles, salaries, and locations",),
        ),
        # Test 3: Query for company and department structure
        (
            "\nTest 3: Organizational Structure Query",
            ("Extract company details and department names, without position details",),
        ),
        # Test 4: Query for specific skills tracking
        (
            "\nTest 4: Skills Analysis Query",
            ("I want to analyze required skills across all positions",),
        ),
    ]

    for heading, query_args in cases:
        print(heading)
        schema = JsonCssExtractionStrategy.generate_schema(test_html, *query_args)
        print(json.dumps(schema, indent=2))
|
||||
|
||||
if __name__ == "__main__":
    # Run the schema-generation checks when this file is executed directly.
    test_schema_generation()
|
||||
Reference in New Issue
Block a user