Add Async Version, JsonCss Extractor
This commit is contained in:
90
tests/async/test_content_extraction.py
Normal file
90
tests/async/test_content_extraction.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
# Append the directory two levels above this file to the import search path.
# NOTE(review): for tests/async/test_content_extraction.py this resolves to
# the tests/ directory, not the repository root -- confirm that is intended.
parent_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_dir)
|
||||
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
|
||||
@pytest.mark.asyncio
async def test_extract_markdown():
    """Crawl the NBC News business page and expect non-empty markdown text.

    The crawl is issued with ``bypass_cache=True``; the result must report
    success and expose ``markdown`` as a non-empty ``str``.
    """
    async with AsyncWebCrawler(verbose=True) as web_crawler:
        crawl = await web_crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True,
        )

        assert crawl.success

        markdown = crawl.markdown
        assert markdown
        assert isinstance(markdown, str)
        assert len(markdown) > 0
|
||||
|
||||
@pytest.mark.asyncio
async def test_extract_cleaned_html():
    """Crawl the NBC News business page and expect non-empty cleaned HTML.

    The crawl is issued with ``bypass_cache=True``; the result must report
    success and expose ``cleaned_html`` as a non-empty ``str``.
    """
    async with AsyncWebCrawler(verbose=True) as web_crawler:
        crawl = await web_crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True,
        )

        assert crawl.success

        cleaned = crawl.cleaned_html
        assert cleaned
        assert isinstance(cleaned, str)
        assert len(cleaned) > 0
|
||||
|
||||
@pytest.mark.asyncio
async def test_extract_media():
    """Crawl the NBC News business page and validate the ``media`` payload.

    ``media`` must be a non-empty ``dict`` holding an ``"images"`` list, and
    every image entry must contain the ``"src"``, ``"alt"`` and ``"type"``
    fields.
    """
    async with AsyncWebCrawler(verbose=True) as web_crawler:
        crawl = await web_crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True,
        )

        assert crawl.success

        media = crawl.media
        assert media
        assert isinstance(media, dict)
        assert "images" in media

        images = media["images"]
        assert isinstance(images, list)

        # Each image entry is checked for the same three fields, in order.
        for entry in images:
            for field in ("src", "alt", "type"):
                assert field in entry
|
||||
|
||||
@pytest.mark.asyncio
async def test_extract_links():
    """Crawl the NBC News business page and validate the ``links`` payload.

    ``links`` must be a non-empty ``dict`` with ``"internal"`` and
    ``"external"`` lists, and every link entry in either list must contain
    the ``"href"`` and ``"text"`` fields.
    """
    async with AsyncWebCrawler(verbose=True) as web_crawler:
        crawl = await web_crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True,
        )

        assert crawl.success

        links = crawl.links
        assert links
        assert isinstance(links, dict)

        groups = ("internal", "external")
        for group in groups:
            assert group in links
        for group in groups:
            assert isinstance(links[group], list)

        # Internal links are inspected first, then external ones.
        for group in groups:
            for entry in links[group]:
                assert "href" in entry
                assert "text" in entry
|
||||
|
||||
@pytest.mark.asyncio
async def test_extract_metadata():
    """Crawl the NBC News business page and validate the ``metadata`` payload.

    ``metadata`` must be a non-empty ``dict`` whose ``"title"`` entry is a
    ``str``.
    """
    async with AsyncWebCrawler(verbose=True) as web_crawler:
        crawl = await web_crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True,
        )

        assert crawl.success

        metadata = crawl.metadata
        assert metadata
        assert isinstance(metadata, dict)
        assert "title" in metadata

        title = metadata["title"]
        assert isinstance(title, str)
|
||||
|
||||
@pytest.mark.asyncio
async def test_css_selector_extraction():
    """Crawl with a heading-only CSS selector and look for heading markers.

    The crawl passes ``css_selector="h1, h2, h3"`` and then checks that the
    resulting markdown contains the ``#``, ``##`` and ``###`` markers.
    """
    async with AsyncWebCrawler(verbose=True) as web_crawler:
        heading_selector = "h1, h2, h3"
        crawl = await web_crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True,
            css_selector=heading_selector,
        )

        assert crawl.success
        assert crawl.markdown

        # NOTE(review): these are plain substring checks, so the presence of
        # "###" already implies "##" and "#". Match whole heading lines if
        # the individual levels need to be told apart.
        for marker in ("#", "##", "###"):
            assert marker in crawl.markdown
|
||||
|
||||
# Entry point for running this module directly (e.g. while debugging).
# pytest.main() returns the exit code rather than exiting, so pass it to
# sys.exit() to make test failures visible in the process exit status.
if __name__ == "__main__":
    sys.exit(pytest.main([__file__, "-v"]))
|
||||
Reference in New Issue
Block a user