Apply Ruff Corrections
This commit is contained in:
@@ -1,31 +1,32 @@
|
||||
import os, time
|
||||
|
||||
# append the path to the root of the project
|
||||
import sys
|
||||
import asyncio
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
from firecrawl import FirecrawlApp
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
__data__ = os.path.join(os.path.dirname(__file__), '..', '..') + '/.data'
|
||||
|
||||
__data__ = os.path.join(os.path.dirname(__file__), "..", "..") + "/.data"
|
||||
|
||||
|
||||
async def compare():
|
||||
app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
|
||||
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
|
||||
|
||||
# Tet Firecrawl with a simple crawl
|
||||
start = time.time()
|
||||
scrape_status = app.scrape_url(
|
||||
'https://www.nbcnews.com/business',
|
||||
params={'formats': ['markdown', 'html']}
|
||||
"https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
|
||||
)
|
||||
end = time.time()
|
||||
print(f"Time taken: {end - start} seconds")
|
||||
print(len(scrape_status['markdown']))
|
||||
print(len(scrape_status["markdown"]))
|
||||
# save the markdown content with provider name
|
||||
with open(f"{__data__}/firecrawl_simple.md", "w") as f:
|
||||
f.write(scrape_status['markdown'])
|
||||
f.write(scrape_status["markdown"])
|
||||
# Count how many "cldnry.s-nbcnews.com" are in the markdown
|
||||
print(scrape_status['markdown'].count("cldnry.s-nbcnews.com"))
|
||||
|
||||
|
||||
print(scrape_status["markdown"].count("cldnry.s-nbcnews.com"))
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
start = time.time()
|
||||
@@ -33,13 +34,13 @@ async def compare():
|
||||
url="https://www.nbcnews.com/business",
|
||||
# js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
|
||||
word_count_threshold=0,
|
||||
bypass_cache=True,
|
||||
verbose=False
|
||||
bypass_cache=True,
|
||||
verbose=False,
|
||||
)
|
||||
end = time.time()
|
||||
print(f"Time taken: {end - start} seconds")
|
||||
print(len(result.markdown))
|
||||
# save the markdown content with provider name
|
||||
# save the markdown content with provider name
|
||||
with open(f"{__data__}/crawl4ai_simple.md", "w") as f:
|
||||
f.write(result.markdown)
|
||||
# count how many "cldnry.s-nbcnews.com" are in the markdown
|
||||
@@ -48,10 +49,12 @@ async def compare():
|
||||
start = time.time()
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
|
||||
js_code=[
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
],
|
||||
word_count_threshold=0,
|
||||
bypass_cache=True,
|
||||
verbose=False
|
||||
bypass_cache=True,
|
||||
verbose=False,
|
||||
)
|
||||
end = time.time()
|
||||
print(f"Time taken: {end - start} seconds")
|
||||
@@ -61,7 +64,7 @@ async def compare():
|
||||
f.write(result.markdown)
|
||||
# count how many "cldnry.s-nbcnews.com" are in the markdown
|
||||
print(result.markdown.count("cldnry.s-nbcnews.com"))
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(compare())
|
||||
|
||||
Reference in New Issue
Block a user