Update README
This commit is contained in:
31
README.md
31
README.md
@@ -523,8 +523,33 @@ async def test_news_crawl():
|
|||||||
|
|
||||||
- **📊 Table-to-DataFrame Extraction**: Extract HTML tables directly to CSV or pandas DataFrames:
|
- **📊 Table-to-DataFrame Extraction**: Extract HTML tables directly to CSV or pandas DataFrames:
|
||||||
```python
|
```python
|
||||||
crawler_config = CrawlerRunConfig(extract_tables=True)
|
crawler = AsyncWebCrawler(config=browser_config)
|
||||||
# Access tables via result.tables or result.tables_as_dataframe
|
await crawler.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Set up scraping parameters
|
||||||
|
crawl_config = CrawlerRunConfig(
|
||||||
|
table_score_threshold=8, # Strict table detection
|
||||||
|
)
|
||||||
|
|
||||||
|
# Execute market data extraction
|
||||||
|
results: List[CrawlResult] = await crawler.arun(
|
||||||
|
url="https://coinmarketcap.com/?page=1", config=crawl_config
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process results
|
||||||
|
raw_df = pd.DataFrame()
|
||||||
|
for result in results:
|
||||||
|
if result.success and result.media["tables"]:
|
||||||
|
raw_df = pd.DataFrame(
|
||||||
|
result.media["tables"][0]["rows"],
|
||||||
|
columns=result.media["tables"][0]["headers"],
|
||||||
|
)
|
||||||
|
break
|
||||||
|
print(raw_df.head())
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await crawler.stop()
|
||||||
```
|
```
|
||||||
|
|
||||||
- **🚀 Browser Pooling**: Pages launch hot with pre-warmed browser instances for lower latency and memory usage
|
- **🚀 Browser Pooling**: Pages launch hot with pre-warmed browser instances for lower latency and memory usage
|
||||||
@@ -544,7 +569,7 @@ async def test_news_crawl():
|
|||||||
claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse
|
claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse
|
||||||
```
|
```
|
||||||
|
|
||||||
- **🖥️ Interactive Playground**: Test configurations and generate API requests with the built-in web interface at `/playground`
|
- **🖥️ Interactive Playground**: Test configurations and generate API requests with the built-in web interface at `http://localhost:11235/playground`
|
||||||
|
|
||||||
- **🐳 Revamped Docker Deployment**: Streamlined multi-architecture Docker image with improved resource efficiency
|
- **🐳 Revamped Docker Deployment**: Streamlined multi-architecture Docker image with improved resource efficiency
|
||||||
|
|
||||||
|
|||||||
@@ -383,29 +383,29 @@ async def main():
|
|||||||
scroll_delay=0.2,
|
scroll_delay=0.2,
|
||||||
)
|
)
|
||||||
|
|
||||||
# # Execute market data extraction
|
# Execute market data extraction
|
||||||
# results: List[CrawlResult] = await crawler.arun(
|
results: List[CrawlResult] = await crawler.arun(
|
||||||
# url="https://coinmarketcap.com/?page=1", config=crawl_config
|
url="https://coinmarketcap.com/?page=1", config=crawl_config
|
||||||
# )
|
)
|
||||||
|
|
||||||
# # Process results
|
# Process results
|
||||||
# raw_df = pd.DataFrame()
|
raw_df = pd.DataFrame()
|
||||||
# for result in results:
|
for result in results:
|
||||||
# if result.success and result.media["tables"]:
|
if result.success and result.media["tables"]:
|
||||||
# # Extract primary market table
|
# Extract primary market table
|
||||||
# # DataFrame
|
# DataFrame
|
||||||
# raw_df = pd.DataFrame(
|
raw_df = pd.DataFrame(
|
||||||
# result.media["tables"][0]["rows"],
|
result.media["tables"][0]["rows"],
|
||||||
# columns=result.media["tables"][0]["headers"],
|
columns=result.media["tables"][0]["headers"],
|
||||||
# )
|
)
|
||||||
# break
|
break
|
||||||
|
|
||||||
|
|
||||||
# This is for debugging only
|
# This is for debugging only
|
||||||
# ////// Remove this in production from here..
|
# ////// Remove this in production from here..
|
||||||
# Save raw data for debugging
|
# Save raw data for debugging
|
||||||
# raw_df.to_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv", index=False)
|
raw_df.to_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv", index=False)
|
||||||
# print("🔍 Raw data saved to 'raw_crypto_data.csv'")
|
print("🔍 Raw data saved to 'raw_crypto_data.csv'")
|
||||||
|
|
||||||
# Read from file for debugging
|
# Read from file for debugging
|
||||||
raw_df = pd.read_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv")
|
raw_df = pd.read_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv")
|
||||||
|
|||||||
Reference in New Issue
Block a user