diff --git a/README.md b/README.md index c75dd07d..191614f4 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,33 @@ result = crawler.run(url="https://www.nbcnews.com/business") print(result.markdown) ``` +### Speed-First Design πŸš€ + +Perhaps the most important design principle for this library is speed. We need to ensure it can handle many links and resources in parallel as quickly as possible. By combining this speed with fast LLMs like Groq, the results will be truly amazing. + +```python +import time +from crawl4ai.web_crawler import WebCrawler +crawler = WebCrawler() +crawler.warmup() + +start = time.time() +url = r"https://www.nbcnews.com/business" +result = crawler.run( url, word_count_threshold=10, bypass_cache=True) +end = time.time() +print(f"Time taken: {end - start}") +``` + +Let's take a look at the calculated time for the above code snippet: + +```bash +[LOG] πŸš€ Crawling done, success: True, time taken: 1.3623387813568115 seconds +[LOG] πŸš€ Content extracted, success: True, time taken: 0.05715131759643555 seconds +[LOG] πŸš€ Extraction, time taken: 0.05750393867492676 seconds. +Time taken: 1.439958095550537 +``` +Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. πŸš€ + ### Extract Structured Data from Web Pages πŸ“Š Crawl all OpenAI models and their fees from the official page. @@ -60,19 +87,30 @@ Crawl all OpenAI models and their fees from the official page. 
import os from crawl4ai import WebCrawler from crawl4ai.extraction_strategy import LLMExtractionStrategy +from pydantic import BaseModel, Field + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") url = 'https://openai.com/api/pricing/' crawler = WebCrawler() crawler.warmup() result = crawler.run( - url=url, - extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4", - api_token=os.getenv('OPENAI_API_KEY'), - instruction="Extract all model names and their fees for input and output tokens." - ), -) + url=url, + word_count_threshold=1, + extraction_strategy= LLMExtractionStrategy( + provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + schema=OpenAIModelFee.schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. + Do not miss any models in the entire content. One extracted model JSON format should look like this: + {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""" + ), + bypass_cache=True, + ) print(result.extracted_content) ``` @@ -119,3 +157,7 @@ For questions, suggestions, or feedback, feel free to reach out: - Website: [crawl4ai.com](https://crawl4ai.com) Happy Crawling! 
πŸ•ΈοΈπŸš€ + +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) \ No newline at end of file diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index ad687231..a33663e8 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -47,7 +47,7 @@ class WebCrawler: extraction_strategy= NoExtractionStrategy(), bypass_cache=False, verbose = False, - warmup=True + # warmup=True ) self.ready = True print("[LOG] 🌞 WebCrawler is ready to crawl") @@ -160,7 +160,11 @@ class WebCrawler: if not cached or not html: if user_agent: self.crawler_strategy.update_user_agent(user_agent) + t1 = time.time() html = self.crawler_strategy.crawl(url) + t2 = time.time() + if verbose: + print(f"[LOG] πŸš€ Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds") if screenshot: screenshot_data = self.crawler_strategy.take_screenshot() @@ -189,7 +193,8 @@ class WebCrawler: # print(f"[LOG] πŸš€ Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds") t1 = time.time() result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) - print(f"[LOG] πŸš€ Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds") + if verbose: + print(f"[LOG] πŸš€ Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds") if result is None: raise ValueError(f"Failed to extract content from the website: {url}") @@ -201,9 +206,6 @@ class WebCrawler: media = result.get("media", []) links = result.get("links", []) metadata = result.get("metadata", {}) - - if verbose: - print(f"[LOG] πŸš€ Crawling done for {url}, success: True, time taken: {time.time() - t} seconds") if extracted_content is None: if verbose: