Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3c6ebb73ae |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -189,6 +189,4 @@ a.txt
|
|||||||
.lambda_function.py
|
.lambda_function.py
|
||||||
ec2*
|
ec2*
|
||||||
|
|
||||||
update_changelog.sh
|
update_changelog.sh
|
||||||
test_env/
|
|
||||||
tmp/
|
|
||||||
@@ -101,7 +101,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
variable_values["REQUEST"] = self.instruction
|
variable_values["REQUEST"] = self.instruction
|
||||||
prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
|
prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
|
||||||
|
|
||||||
if self.extract_type == "schema" and self.schema:
|
if self.extract_type == "schema":
|
||||||
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
|
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
|
||||||
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
|
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
|
||||||
|
|
||||||
|
|||||||
@@ -834,6 +834,7 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
|
|||||||
|
|
||||||
return sum(all_blocks, [])
|
return sum(all_blocks, [])
|
||||||
|
|
||||||
|
|
||||||
def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
||||||
"""
|
"""
|
||||||
Merges small chunks into larger ones based on the total token threshold.
|
Merges small chunks into larger ones based on the total token threshold.
|
||||||
@@ -879,6 +880,7 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
|
|||||||
|
|
||||||
return extracted_content
|
return extracted_content
|
||||||
|
|
||||||
|
|
||||||
def wrap_text(draw, text, font, max_width):
|
def wrap_text(draw, text, font, max_width):
|
||||||
# Wrap the text to fit within the specified width
|
# Wrap the text to fit within the specified width
|
||||||
lines = []
|
lines = []
|
||||||
@@ -890,6 +892,7 @@ def wrap_text(draw, text, font, max_width):
|
|||||||
lines.append(line)
|
lines.append(line)
|
||||||
return '\n'.join(lines)
|
return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
def format_html(html_string):
|
def format_html(html_string):
|
||||||
soup = BeautifulSoup(html_string, 'html.parser')
|
soup = BeautifulSoup(html_string, 'html.parser')
|
||||||
return soup.prettify()
|
return soup.prettify()
|
||||||
|
|||||||
@@ -16,40 +16,23 @@ warnings.filterwarnings("ignore", message='Field "model_name" has conflict with
|
|||||||
|
|
||||||
|
|
||||||
class WebCrawler:
|
class WebCrawler:
|
||||||
def __init__(
|
def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
|
||||||
self,
|
|
||||||
# db_path: str = None,
|
|
||||||
crawler_strategy: CrawlerStrategy = None,
|
|
||||||
always_by_pass_cache: bool = False,
|
|
||||||
verbose: bool = False,
|
|
||||||
):
|
|
||||||
# self.db_path = db_path
|
|
||||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
|
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
|
||||||
self.always_by_pass_cache = always_by_pass_cache
|
self.always_by_pass_cache = always_by_pass_cache
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
|
||||||
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||||
|
|
||||||
# If db_path is not provided, use the default path
|
|
||||||
# if not db_path:
|
|
||||||
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
|
|
||||||
|
|
||||||
# flush_db()
|
|
||||||
init_db()
|
init_db()
|
||||||
|
|
||||||
self.ready = False
|
self.ready = False
|
||||||
|
|
||||||
def warmup(self):
|
def warmup(self):
|
||||||
print("[LOG] 🌤️ Warming up the WebCrawler")
|
print("[LOG] 🌤️ Warming up the WebCrawler")
|
||||||
result = self.run(
|
self.run(
|
||||||
url='https://google.com/',
|
url='https://google.com/',
|
||||||
word_count_threshold=5,
|
word_count_threshold=5,
|
||||||
extraction_strategy= NoExtractionStrategy(),
|
extraction_strategy=NoExtractionStrategy(),
|
||||||
bypass_cache=False,
|
bypass_cache=False,
|
||||||
verbose = False,
|
verbose=False
|
||||||
# warmup=True
|
|
||||||
)
|
)
|
||||||
self.ready = True
|
self.ready = True
|
||||||
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
||||||
@@ -139,12 +122,8 @@ class WebCrawler:
|
|||||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||||
raise ValueError("Unsupported chunking strategy")
|
raise ValueError("Unsupported chunking strategy")
|
||||||
|
|
||||||
# if word_count_threshold < MIN_WORD_THRESHOLD:
|
|
||||||
# word_count_threshold = MIN_WORD_THRESHOLD
|
|
||||||
|
|
||||||
word_count_threshold = max(word_count_threshold, 0)
|
word_count_threshold = max(word_count_threshold, 0)
|
||||||
|
|
||||||
# Check cache first
|
|
||||||
cached = None
|
cached = None
|
||||||
screenshot_data = None
|
screenshot_data = None
|
||||||
extracted_content = None
|
extracted_content = None
|
||||||
@@ -169,7 +148,7 @@ class WebCrawler:
|
|||||||
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
|
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
|
||||||
t2 = time.time()
|
t2 = time.time()
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds")
|
||||||
if screenshot:
|
if screenshot:
|
||||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||||
|
|
||||||
@@ -200,13 +179,10 @@ class WebCrawler:
|
|||||||
t = time.time()
|
t = time.time()
|
||||||
# Extract content from HTML
|
# Extract content from HTML
|
||||||
try:
|
try:
|
||||||
# t1 = time.time()
|
|
||||||
# result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
|
||||||
# print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
|
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds")
|
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")
|
||||||
|
|
||||||
if result is None:
|
if result is None:
|
||||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||||
@@ -228,7 +204,7 @@ class WebCrawler:
|
|||||||
extracted_content = json.dumps(extracted_content, indent=4, default=str)
|
extracted_content = json.dumps(extracted_content, indent=4, default=str)
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
|
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds.")
|
||||||
|
|
||||||
screenshot = None if not screenshot else screenshot
|
screenshot = None if not screenshot else screenshot
|
||||||
|
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -19,7 +19,7 @@ with open("requirements.txt") as f:
|
|||||||
requirements = f.read().splitlines()
|
requirements = f.read().splitlines()
|
||||||
|
|
||||||
# Define the requirements for different environments
|
# Define the requirements for different environments
|
||||||
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn"))]
|
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "numpy"))]
|
||||||
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
|
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
|
||||||
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]
|
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user