- Test all methods

- Update index.html
- Update Readme
- Resolve some bugs
unclecode
2024-05-14 21:27:41 +08:00
parent 5fea6c064b
commit f6e59157bf
17 changed files with 1004 additions and 402 deletions


@@ -2,7 +2,7 @@ import os, time
 from pathlib import Path
 from .models import UrlModel, CrawlResult
-from .database import init_db, get_cached_url, cache_url
+from .database import init_db, get_cached_url, cache_url, DB_PATH
 from .utils import *
 from .chunking_strategy import *
 from .extraction_strategy import *
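The newly exported DB_PATH makes the cache location visible outside the database module. A minimal sketch of inspecting the SQLite cache from a script, assuming the package import path crawl4ai.database and a table named crawled_data whose columns mirror the cached tuple unpacked in run() below; neither name is confirmed by this diff:

# A minimal sketch: peek at the crawler's SQLite cache via the exported DB_PATH.
# Assumptions: import path crawl4ai.database and table name "crawled_data"
# are guesses not confirmed by this diff.
import sqlite3

from crawl4ai.database import DB_PATH

def count_cached_pages() -> int:
    with sqlite3.connect(DB_PATH) as conn:
        (count,) = conn.execute("SELECT COUNT(*) FROM crawled_data").fetchone()
    return count

if __name__ == "__main__":
    print(f"{count_cached_pages()} pages cached at {DB_PATH}")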
@@ -10,6 +10,7 @@ from .crawler_strategy import *
 from typing import List
 from concurrent.futures import ThreadPoolExecutor
 from .config import *
+# from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
 class WebCrawler:
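Leaving the model_loader import commented out suggests heavy models are no longer pulled in at import time. A sketch of the deferred-import pattern this enables, reusing a loader name from the commented line; its real signature is not shown in this diff:

# Sketch of deferring a heavy model import until first use instead of at
# module import time. The loader name comes from the commented-out import
# above; its actual signature is an assumption.
_spacy_model = None

def get_spacy_model():
    global _spacy_model
    if _spacy_model is None:
        from crawl4ai.model_loader import load_spacy_model  # imported lazily
        _spacy_model = load_spacy_model()
    return _spacy_model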
@@ -36,11 +37,11 @@ class WebCrawler:
     def warmup(self):
         print("[LOG] 🌤️ Warming up the WebCrawler")
-        single_url = UrlModel(url='https://crawl4ai.uccode.io/', forced=False)
         result = self.run(
-            single_url,
+            url='https://crawl4ai.uccode.io/',
             word_count_threshold=5,
-            extraction_strategy= CosinegStrategy(),
+            extraction_strategy= CosineStrategy(),
+            bypass_cache=False,
             verbose = False
         )
         self.ready = True
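For context, a minimal usage sketch of the reworked warmup(), assuming the module is importable as crawl4ai.web_crawler (the import path is not shown in this diff):

# A minimal sketch: warm up the crawler once, then reuse it for real requests.
# Assumption: the import path crawl4ai.web_crawler is not confirmed by this diff.
from crawl4ai.web_crawler import WebCrawler

crawler = WebCrawler()
crawler.warmup()   # primes the crawler and sets crawler.ready = True
assert crawler.ready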
@@ -60,10 +61,11 @@ class WebCrawler:
         **kwargs,
     ) -> CrawlResult:
         return self.run(
-            url_model,
+            url_model.url,
            word_count_threshold,
            extraction_strategy,
            chunking_strategy,
+            bypass_cache=url_model.forced,
            **kwargs,
        )
        pass
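This wrapper now unpacks the UrlModel itself, mapping url_model.forced onto the new bypass_cache flag, so existing callers holding a UrlModel keep working. A sketch of calling it, assuming the wrapper is named fetch_page; its def line falls outside the visible hunk:

# Sketch of the UrlModel-based wrapper. Assumptions: the method name
# "fetch_page" and the import paths are guesses not confirmed by this diff.
from crawl4ai.models import UrlModel
from crawl4ai.web_crawler import WebCrawler

crawler = WebCrawler()
page = UrlModel(url="https://example.com", forced=True)  # forced=True maps to bypass_cache=True
result = crawler.fetch_page(page)  # delegates to crawler.run(page.url, ...)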
@@ -71,77 +73,85 @@ class WebCrawler:
     def run(
         self,
-        url_model: UrlModel,
+        url: str,
         word_count_threshold=MIN_WORD_THRESHOLD,
         extraction_strategy: ExtractionStrategy = NoExtractionStrategy(),
         chunking_strategy: ChunkingStrategy = RegexChunking(),
+        bypass_cache: bool = False,
+        css_selector: str = None,
         verbose=True,
         **kwargs,
     ) -> CrawlResult:
         # Check that the extraction strategy is an instance of ExtractionStrategy; if not, raise an error
         if not isinstance(extraction_strategy, ExtractionStrategy):
             raise ValueError("Unsupported extraction strategy")
         if not isinstance(chunking_strategy, ChunkingStrategy):
             raise ValueError("Unsupported chunking strategy")
         # Make sure word_count_threshold is not less than MIN_WORD_THRESHOLD
         if word_count_threshold < MIN_WORD_THRESHOLD:
             word_count_threshold = MIN_WORD_THRESHOLD
         # Check cache first
-        cached = get_cached_url(self.db_path, str(url_model.url))
-        if cached and not url_model.forced:
-            return CrawlResult(
-                **{
-                    "url": cached[0],
-                    "html": cached[1],
-                    "cleaned_html": cached[2],
-                    "markdown": cached[3],
-                    "parsed_json": cached[4],
-                    "success": cached[5],
-                    "error_message": "",
-                }
-            )
+        if not bypass_cache:
+            cached = get_cached_url(url)
+            if cached:
+                return CrawlResult(
+                    **{
+                        "url": cached[0],
+                        "html": cached[1],
+                        "cleaned_html": cached[2],
+                        "markdown": cached[3],
+                        "parsed_json": cached[4],
+                        "success": cached[5],
+                        "error_message": "",
+                    }
+                )
         # Initialize WebDriver for crawling
         t = time.time()
-        try:
-            html = self.crawler_strategy.crawl(str(url_model.url))
-            success = True
-            error_message = ""
-        except Exception as e:
-            html = ""
-            success = False
-            error_message = str(e)
+        html = self.crawler_strategy.crawl(url)
+        success = True
+        error_message = ""
         # Extract content from HTML
-        result = get_content_of_website(html, word_count_threshold)
+        try:
+            result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
+            if result is None:
+                raise ValueError(f"Failed to extract content from the website: {url}")
+        except InvalidCSSSelectorError as e:
+            raise ValueError(str(e))
         cleaned_html = result.get("cleaned_html", html)
         markdown = result.get("markdown", "")
         # Print a professional LOG-style message with the time taken once crawling is done
         if verbose:
             print(
-                f"[LOG] 🚀 Crawling done for {url_model.url}, success: {success}, time taken: {time.time() - t} seconds"
+                f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
             )
         parsed_json = []
         if verbose:
-            print(f"[LOG] 🔥 Extracting semantic blocks for {url_model.url}")
+            print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
         t = time.time()
         # Split markdown into sections
         sections = chunking_strategy.chunk(markdown)
         # sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
         parsed_json = extraction_strategy.run(
-            str(url_model.url), sections,
+            url, sections,
         )
         parsed_json = json.dumps(parsed_json)
         if verbose:
             print(
-                f"[LOG] 🚀 Extraction done for {url_model.url}, time taken: {time.time() - t} seconds."
+                f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
             )
         # Cache the result
         cleaned_html = beautify_html(cleaned_html)
         cache_url(
-            self.db_path,
-            str(url_model.url),
+            url,
             html,
             cleaned_html,
             markdown,
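Taken together, these changes replace the UrlModel parameter with a plain URL string and surface two new knobs, bypass_cache and css_selector. A usage sketch under the new signature; the import paths are assumptions:

# A minimal sketch of the reworked run(): a plain URL string plus the new
# bypass_cache and css_selector parameters. Import paths are assumptions.
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.extraction_strategy import CosineStrategy
from crawl4ai.chunking_strategy import RegexChunking

crawler = WebCrawler()
result = crawler.run(
    url="https://example.com",
    word_count_threshold=10,               # values below MIN_WORD_THRESHOLD are clamped up
    extraction_strategy=CosineStrategy(),  # must be an ExtractionStrategy, else ValueError
    chunking_strategy=RegexChunking(),     # must be a ChunkingStrategy, else ValueError
    css_selector="main article",           # invalid selectors surface as ValueError
    bypass_cache=True,                     # skip the cache read; the fresh result is still cached
)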
@@ -150,7 +160,7 @@ class WebCrawler:
         )
         return CrawlResult(
-            url=str(url_model.url),
+            url=url,
             html=html,
             cleaned_html=cleaned_html,
             markdown=markdown,
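Since parsed_json is serialized with json.dumps before being stored on the result, consumers decode it back. A short sketch continuing the run() example above:

# A minimal sketch of consuming the returned CrawlResult. parsed_json is a
# JSON string (json.dumps above), so decode it before use.
import json

if result.success:
    blocks = json.loads(result.parsed_json)
    print(f"{result.url}: {len(blocks)} extracted blocks")
    print(result.markdown[:200])  # markdown rendering of the cleaned page
else:
    print(f"Crawl failed: {result.error_message}")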