Apply Ruff Corrections

This commit is contained in:
UncleCode
2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions

View File

@@ -1,4 +1,5 @@
import os, sys
# append the parent directory to the sys.path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
@@ -13,19 +14,18 @@ import json
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
# 1. File Download Processing Example
async def download_example():
"""Example of downloading files from Python.org"""
# downloads_path = os.path.join(os.getcwd(), "downloads")
downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
os.makedirs(downloads_path, exist_ok=True)
print(f"Downloads will be saved to: {downloads_path}")
async with AsyncWebCrawler(
accept_downloads=True,
downloads_path=downloads_path,
verbose=True
accept_downloads=True, downloads_path=downloads_path, verbose=True
) as crawler:
result = await crawler.arun(
url="https://www.python.org/downloads/",
@@ -40,9 +40,9 @@ async def download_example():
}
""",
delay_before_return_html=1, # Wait 5 seconds to ensure download starts
cache_mode=CacheMode.BYPASS
cache_mode=CacheMode.BYPASS,
)
if result.downloaded_files:
print("\nDownload successful!")
print("Downloaded files:")
@@ -52,25 +52,26 @@ async def download_example():
else:
print("\nNo files were downloaded")
# 2. Local File and Raw HTML Processing Example
async def local_and_raw_html_example():
"""Example of processing local files and raw HTML"""
# Create a sample HTML file
sample_file = os.path.join(__data__, "sample.html")
with open(sample_file, "w") as f:
f.write("""
f.write(
"""
<html><body>
<h1>Test Content</h1>
<p>This is a test paragraph.</p>
</body></html>
""")
"""
)
async with AsyncWebCrawler(verbose=True) as crawler:
# Process local file
local_result = await crawler.arun(
url=f"file://{os.path.abspath(sample_file)}"
)
local_result = await crawler.arun(url=f"file://{os.path.abspath(sample_file)}")
# Process raw HTML
raw_html = """
<html><body>
@@ -78,16 +79,15 @@ async def local_and_raw_html_example():
<p>This is a test of raw HTML processing.</p>
</body></html>
"""
raw_result = await crawler.arun(
url=f"raw:{raw_html}"
)
raw_result = await crawler.arun(url=f"raw:{raw_html}")
# Clean up
os.remove(sample_file)
print("Local file content:", local_result.markdown)
print("\nRaw HTML content:", raw_result.markdown)
# 3. Enhanced Markdown Generation Example
async def markdown_generation_example():
"""Example of enhanced markdown generation with citations and LLM-friendly features"""
@@ -97,58 +97,66 @@ async def markdown_generation_example():
# user_query="History and cultivation",
bm25_threshold=1.0
)
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/Apple",
css_selector="main div#bodyContent",
content_filter=content_filter,
cache_mode=CacheMode.BYPASS
cache_mode=CacheMode.BYPASS,
)
from crawl4ai import AsyncWebCrawler
from crawl4ai.content_filter_strategy import BM25ContentFilter
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/Apple",
css_selector="main div#bodyContent",
content_filter=BM25ContentFilter()
content_filter=BM25ContentFilter(),
)
print(result.markdown_v2.fit_markdown)
print("\nMarkdown Generation Results:")
print(f"1. Original markdown length: {len(result.markdown)}")
print(f"2. New markdown versions (markdown_v2):")
print("2. New markdown versions (markdown_v2):")
print(f" - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
print(f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}")
print(f" - References section length: {len(result.markdown_v2.references_markdown)}")
print(
f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}"
)
print(
f" - References section length: {len(result.markdown_v2.references_markdown)}"
)
if result.markdown_v2.fit_markdown:
print(f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}")
print(
f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}"
)
# Save examples to files
output_dir = os.path.join(__data__, "markdown_examples")
os.makedirs(output_dir, exist_ok=True)
# Save different versions
with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
f.write(result.markdown_v2.raw_markdown)
with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
f.write(result.markdown_v2.markdown_with_citations)
with open(os.path.join(output_dir, "3_references.md"), "w") as f:
f.write(result.markdown_v2.references_markdown)
if result.markdown_v2.fit_markdown:
with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
f.write(result.markdown_v2.fit_markdown)
print(f"\nMarkdown examples saved to: {output_dir}")
# Show a sample of citations and references
print("\nSample of markdown with citations:")
print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
print("Sample of references:")
print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...")
print(
"\n".join(result.markdown_v2.references_markdown.split("\n")[:10]) + "..."
)
# 4. Browser Management Example
async def browser_management_example():
@@ -156,38 +164,38 @@ async def browser_management_example():
# Use the specified user directory path
user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
os.makedirs(user_data_dir, exist_ok=True)
print(f"Browser profile will be saved to: {user_data_dir}")
async with AsyncWebCrawler(
use_managed_browser=True,
user_data_dir=user_data_dir,
headless=False,
verbose=True
verbose=True,
) as crawler:
result = await crawler.arun(
url="https://crawl4ai.com",
# session_id="persistent_session_1",
cache_mode=CacheMode.BYPASS
)
cache_mode=CacheMode.BYPASS,
)
# Use GitHub as an example - it's a good test for browser management
# because it requires proper browser handling
result = await crawler.arun(
url="https://github.com/trending",
# session_id="persistent_session_1",
cache_mode=CacheMode.BYPASS
cache_mode=CacheMode.BYPASS,
)
print("\nBrowser session result:", result.success)
if result.success:
print("Page title:", result.metadata.get('title', 'No title found'))
print("Page title:", result.metadata.get("title", "No title found"))
# 5. API Usage Example
async def api_example():
"""Example of using the new API endpoints"""
api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
headers = {'Authorization': f'Bearer {api_token}'}
api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
headers = {"Authorization": f"Bearer {api_token}"}
async with aiohttp.ClientSession() as session:
# Submit crawl job
crawl_request = {
@@ -199,25 +207,17 @@ async def api_example():
"name": "Hacker News Articles",
"baseSelector": ".athing",
"fields": [
{
"name": "title",
"selector": ".title a",
"type": "text"
},
{
"name": "score",
"selector": ".score",
"type": "text"
},
{"name": "title", "selector": ".title a", "type": "text"},
{"name": "score", "selector": ".score", "type": "text"},
{
"name": "url",
"selector": ".title a",
"type": "attribute",
"attribute": "href"
}
]
"attribute": "href",
},
],
}
}
},
},
"crawler_params": {
"headless": True,
@@ -227,51 +227,50 @@ async def api_example():
# "screenshot": True,
# "magic": True
}
async with session.post(
"http://localhost:11235/crawl",
json=crawl_request,
headers=headers
"http://localhost:11235/crawl", json=crawl_request, headers=headers
) as response:
task_data = await response.json()
task_id = task_data["task_id"]
# Check task status
while True:
async with session.get(
f"http://localhost:11235/task/{task_id}",
headers=headers
f"http://localhost:11235/task/{task_id}", headers=headers
) as status_response:
result = await status_response.json()
print(f"Task status: {result['status']}")
if result["status"] == "completed":
print("Task completed!")
print("Results:")
news = json.loads(result["results"][0]['extracted_content'])
news = json.loads(result["results"][0]["extracted_content"])
print(json.dumps(news[:4], indent=2))
break
else:
await asyncio.sleep(1)
# Main execution
async def main():
# print("Running Crawl4AI feature examples...")
# print("\n1. Running Download Example:")
# await download_example()
# print("\n2. Running Markdown Generation Example:")
# await markdown_generation_example()
# # print("\n3. Running Local and Raw HTML Example:")
# await local_and_raw_html_example()
# # print("\n4. Running Browser Management Example:")
await browser_management_example()
# print("\n5. Running API Example:")
await api_example()
if __name__ == "__main__":
asyncio.run(main())
asyncio.run(main())