Compare commits

..

1 Commit

Author SHA1 Message Date
Umut CAN
3c6ebb73ae Update web_crawler.py
Improve code efficiency, readability, and maintainability in web_crawler.py
2024-08-30 15:30:06 +03:00
5 changed files with 13 additions and 36 deletions

4
.gitignore vendored
View File

@@ -189,6 +189,4 @@ a.txt
.lambda_function.py .lambda_function.py
ec2* ec2*
update_changelog.sh update_changelog.sh
test_env/
tmp/

View File

@@ -101,7 +101,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
variable_values["REQUEST"] = self.instruction variable_values["REQUEST"] = self.instruction
prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
if self.extract_type == "schema" and self.schema: if self.extract_type == "schema":
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION

View File

@@ -834,6 +834,7 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
return sum(all_blocks, []) return sum(all_blocks, [])
def merge_chunks_based_on_token_threshold(chunks, token_threshold): def merge_chunks_based_on_token_threshold(chunks, token_threshold):
""" """
Merges small chunks into larger ones based on the total token threshold. Merges small chunks into larger ones based on the total token threshold.
@@ -879,6 +880,7 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
return extracted_content return extracted_content
def wrap_text(draw, text, font, max_width): def wrap_text(draw, text, font, max_width):
# Wrap the text to fit within the specified width # Wrap the text to fit within the specified width
lines = [] lines = []
@@ -890,6 +892,7 @@ def wrap_text(draw, text, font, max_width):
lines.append(line) lines.append(line)
return '\n'.join(lines) return '\n'.join(lines)
def format_html(html_string): def format_html(html_string):
soup = BeautifulSoup(html_string, 'html.parser') soup = BeautifulSoup(html_string, 'html.parser')
return soup.prettify() return soup.prettify()

View File

@@ -16,40 +16,23 @@ warnings.filterwarnings("ignore", message='Field "model_name" has conflict with
class WebCrawler: class WebCrawler:
def __init__( def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
self,
# db_path: str = None,
crawler_strategy: CrawlerStrategy = None,
always_by_pass_cache: bool = False,
verbose: bool = False,
):
# self.db_path = db_path
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose) self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
self.always_by_pass_cache = always_by_pass_cache self.always_by_pass_cache = always_by_pass_cache
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
# If db_path is not provided, use the default path
# if not db_path:
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
# flush_db()
init_db() init_db()
self.ready = False self.ready = False
def warmup(self): def warmup(self):
print("[LOG] 🌤️ Warming up the WebCrawler") print("[LOG] 🌤️ Warming up the WebCrawler")
result = self.run( self.run(
url='https://google.com/', url='https://google.com/',
word_count_threshold=5, word_count_threshold=5,
extraction_strategy= NoExtractionStrategy(), extraction_strategy=NoExtractionStrategy(),
bypass_cache=False, bypass_cache=False,
verbose = False, verbose=False
# warmup=True
) )
self.ready = True self.ready = True
print("[LOG] 🌞 WebCrawler is ready to crawl") print("[LOG] 🌞 WebCrawler is ready to crawl")
@@ -139,12 +122,8 @@ class WebCrawler:
if not isinstance(chunking_strategy, ChunkingStrategy): if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy") raise ValueError("Unsupported chunking strategy")
# if word_count_threshold < MIN_WORD_THRESHOLD:
# word_count_threshold = MIN_WORD_THRESHOLD
word_count_threshold = max(word_count_threshold, 0) word_count_threshold = max(word_count_threshold, 0)
# Check cache first
cached = None cached = None
screenshot_data = None screenshot_data = None
extracted_content = None extracted_content = None
@@ -169,7 +148,7 @@ class WebCrawler:
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs)) html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
t2 = time.time() t2 = time.time()
if verbose: if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds") print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds")
if screenshot: if screenshot:
screenshot_data = self.crawler_strategy.take_screenshot() screenshot_data = self.crawler_strategy.take_screenshot()
@@ -200,13 +179,10 @@ class WebCrawler:
t = time.time() t = time.time()
# Extract content from HTML # Extract content from HTML
try: try:
# t1 = time.time()
# result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
# print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
t1 = time.time() t1 = time.time()
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
if verbose: if verbose:
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds") print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")
if result is None: if result is None:
raise ValueError(f"Failed to extract content from the website: {url}") raise ValueError(f"Failed to extract content from the website: {url}")
@@ -228,7 +204,7 @@ class WebCrawler:
extracted_content = json.dumps(extracted_content, indent=4, default=str) extracted_content = json.dumps(extracted_content, indent=4, default=str)
if verbose: if verbose:
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.") print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds.")
screenshot = None if not screenshot else screenshot screenshot = None if not screenshot else screenshot

View File

@@ -19,7 +19,7 @@ with open("requirements.txt") as f:
requirements = f.read().splitlines() requirements = f.read().splitlines()
# Define the requirements for different environments # Define the requirements for different environments
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn"))] default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "numpy"))]
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))] torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))] transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]