chore: update .gitignore and enhance changelog with major feature additions and examples
.gitignore (vendored) | 3
@@ -209,4 +209,5 @@ git_issues.md
 .tests/
 .issues/
 .docs/
 .issues/
+.gitboss/
CHANGELOG.md | 30
@@ -1,5 +1,35 @@
# Changelog

## Version 0.3.74, Major Changes

1. **File Download Processing** (Nov 14, 2024)
   - Added capability for users to specify download folders
   - Implemented file download tracking in the crawl result object
   - Created new file: `tests/async/test_async_doanloader.py`

2. **Content Filtering Improvements** (Nov 14, 2024)
   - Introduced Relevance Content Filter as an improvement over Fit Markdown
   - Implemented BM25 algorithm for content relevance matching
   - Added new file: `crawl4ai/content_filter_strategy.py`
   - Removed deprecated file: `crawl4ai/content_cleaning_strategy.py`

3. **Local File and Raw HTML Support** (Nov 13, 2024)
   - Added support for processing local files
   - Implemented raw HTML input handling in AsyncWebCrawler
   - Enhanced `crawl4ai/async_webcrawler.py` with significant performance improvements

4. **Browser Management Enhancements** (Nov 12, 2024)
   - Implemented new async crawler strategy using Playwright
   - Introduced ManagedBrowser for better browser session handling
   - Added support for persistent browser sessions
   - Switched from playwright_stealth to tf-playwright-stealth

5. **API Server Component**
   - Added CORS support
   - Implemented static file serving
   - Enhanced root redirect functionality

## [0.3.74] November 14, 2024

- This release updates the library to process file downloads. Users can now specify a download folder and trigger downloads via JavaScript or other means; every downloaded file is saved, and the list of downloaded files is added to the crawl result object. See `docs/examples/v0.3.74.overview.py` (added in this commit) for working examples.
docs/examples/v0.3.74.overview.py | 195 (new file)
@@ -0,0 +1,195 @@
import asyncio
import os
from pathlib import Path
import aiohttp
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai.content_filter_strategy import BM25ContentFilter


# 1. File Download Processing Example
async def download_example():
    """Example of downloading files from Python.org"""
    # downloads_path = os.path.join(os.getcwd(), "downloads")
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    print(f"Downloads will be saved to: {downloads_path}")

    async with AsyncWebCrawler(
        accept_downloads=True,
        downloads_path=downloads_path,
        verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
            js_code="""
                // Find and click the first Windows installer link
                const downloadLink = document.querySelector('a[href$=".exe"]');
                if (downloadLink) {
                    console.log('Found download link:', downloadLink.href);
                    downloadLink.click();
                } else {
                    console.log('No .exe download link found');
                }
            """,
            wait_for=5  # Wait 5 seconds to ensure the download starts
        )

        if result.downloaded_files:
            print("\nDownload successful!")
            print("Downloaded files:")
            for file_path in result.downloaded_files:
                print(f"- {file_path}")
                print(f"  File size: {os.path.getsize(file_path) / (1024 * 1024):.2f} MB")
        else:
            print("\nNo files were downloaded")

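# Illustrative sanity check (helper not in the library): list the download
# folder directly with the standard library, independent of the crawler's
# result.downloaded_files bookkeeping.
def list_downloads(path: str) -> None:
    for name in sorted(os.listdir(path)):
        full = os.path.join(path, name)
        print(f"{name}: {os.path.getsize(full) / (1024 * 1024):.2f} MB")
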
# 2. Content Filtering with BM25 Example
async def content_filtering_example():
    """Example of using the new BM25 content filtering"""
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Create filter with custom query for OpenAI's blog
        content_filter = BM25ContentFilter(
            user_query="AI language models research innovation",
            bm25_threshold=1.0
        )

        result = await crawler.arun(
            url="https://openai.com/blog",
            extraction_strategy=content_filter
        )

        print(f"Filtered content: {result.extracted_content}")

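# Hedged sketch: the filter can presumably also be applied to HTML directly.
# This assumes the new crawl4ai/content_filter_strategy.py exposes a
# filter_content(html) method returning the retained text fragments; that
# signature is an assumption, since the file's contents are not shown here.
def standalone_filter_sketch(html: str) -> str:
    bm25 = BM25ContentFilter(
        user_query="AI language models research innovation",
        bm25_threshold=1.0
    )
    fragments = bm25.filter_content(html)  # assumed API
    return "\n".join(fragments)
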
# 3. Local File and Raw HTML Processing Example
async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Create a sample HTML file
    sample_file = "sample.html"
    with open(sample_file, "w") as f:
        f.write("""
        <html><body>
            <h1>Test Content</h1>
            <p>This is a test paragraph.</p>
        </body></html>
        """)

    async with AsyncWebCrawler(verbose=True) as crawler:
        # Process local file
        local_result = await crawler.arun(
            url=f"file://{os.path.abspath(sample_file)}"
        )

        # Process raw HTML
        raw_html = """
        <html><body>
            <h1>Raw HTML Test</h1>
            <p>This is a test of raw HTML processing.</p>
        </body></html>
        """
        raw_result = await crawler.arun(
            url=f"raw:{raw_html}"
        )

        # Clean up
        os.remove(sample_file)

        print("Local file content:", local_result.markdown)
        print("\nRaw HTML content:", raw_result.markdown)

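# Illustrative helper (not part of the library): normalize arbitrary input
# into the URL forms AsyncWebCrawler accepts above, i.e. "file://" for local
# paths and the "raw:" prefix for in-memory HTML.
def as_crawl_url(source: str) -> str:
    if source.lstrip().startswith("<"):  # looks like raw HTML markup
        return f"raw:{source}"
    if os.path.exists(source):  # an existing local file
        return f"file://{os.path.abspath(source)}"
    return source  # assume it is already a regular URL
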
# 4. Browser Management Example
async def browser_management_example():
    """Example of using enhanced browser management features"""
    # Use the specified user directory path
    user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
    os.makedirs(user_data_dir, exist_ok=True)

    print(f"Browser profile will be saved to: {user_data_dir}")

    async with AsyncWebCrawler(
        use_managed_browser=True,
        user_data_dir=user_data_dir,
        headless=False,
        verbose=True
    ) as crawler:
        # Use GitHub as an example - it's a good test for browser management
        # because it requires proper browser handling
        result = await crawler.arun(
            url="https://github.com/trending",
            session_id="persistent_session_1",
            js_code="""
                // Custom JavaScript to execute on GitHub's trending page
                const repos = document.querySelectorAll('article.Box-row');
                const data = Array.from(repos).map(repo => ({
                    name: repo.querySelector('h2')?.textContent?.trim(),
                    description: repo.querySelector('p')?.textContent?.trim(),
                    language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim()
                }));
                console.log('Trending repositories:', JSON.stringify(data, null, 2));
            """
        )

        print("\nBrowser session result:", result.success)
        if result.success:
            print("Page title:", result.metadata.get('title', 'No title found'))

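# Sketch of session persistence: a second arun() call with the same
# session_id is assumed to reuse the live browser page, so cookies and page
# state from the first call carry over. The follow-up URL is illustrative.
async def session_reuse_sketch(crawler: AsyncWebCrawler) -> bool:
    first = await crawler.arun(
        url="https://github.com/trending",
        session_id="persistent_session_1"
    )
    second = await crawler.arun(
        url="https://github.com/trending/python",
        session_id="persistent_session_1"
    )
    return first.success and second.success
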
# 5. API Usage Example
async def api_example():
    """Example of using the new API endpoints"""
    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        crawl_request = {
            "urls": ["https://news.ycombinator.com"],  # Hacker News as an example
            "extraction_config": {
                "type": "json_css",
                "params": {
                    "selectors": {
                        "titles": ".title a",
                        "scores": ".score",
                        "comments": ".comment-tree"
                    }
                }
            },
            "crawler_params": {
                "headless": True,
                "use_managed_browser": True
            },
            "screenshot": True,
            "magic": True
        }

        async with session.post(
            "http://localhost:11235/crawl",
            json=crawl_request
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]

        # Check task status
        async with session.get(
            f"http://localhost:11235/task/{task_id}"
        ) as status_response:
            result = await status_response.json()
            print(f"Task result: {result}")

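# Polling sketch: /crawl returns a task id immediately, so a single status
# check may find the task still running. The "status" field and its terminal
# values ("completed", "failed") are assumptions about the server's response
# shape, which is not part of this diff.
async def wait_for_task(session: aiohttp.ClientSession, task_id: str) -> dict:
    while True:
        async with session.get(f"http://localhost:11235/task/{task_id}") as resp:
            payload = await resp.json()
        if payload.get("status") in ("completed", "failed"):
            return payload
        await asyncio.sleep(1)
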
# Main execution
async def main():
    print("Running Crawl4AI feature examples...")

    print("\n1. Running Download Example:")
    await download_example()

    print("\n2. Running Content Filtering Example:")
    await content_filtering_example()

    print("\n3. Running Local and Raw HTML Example:")
    await local_and_raw_html_example()

    print("\n4. Running Browser Management Example:")
    await browser_management_example()

    print("\n5. Running API Example:")
    await api_example()

if __name__ == "__main__":
    asyncio.run(main())
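Usage note: running python docs/examples/v0.3.74.overview.py executes all five demos in order. The api_example() call expects the API server component (item 5 in the changelog) to already be listening on localhost:11235, and download_example() saves the python.org installer under ~/.crawl4ai/downloads.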