This commit is contained in:
Unclecode
2024-06-08 08:53:54 +00:00
6 changed files with 151 additions and 14 deletions

View File

@@ -20,6 +20,7 @@ def init_db():
extracted_content TEXT, extracted_content TEXT,
success BOOLEAN, success BOOLEAN,
media TEXT DEFAULT "{}", media TEXT DEFAULT "{}",
links TEXT DEFAULT "{}",
screenshot TEXT DEFAULT "" screenshot TEXT DEFAULT ""
) )
''') ''')
@@ -41,12 +42,12 @@ def check_db_path():
if not DB_PATH: if not DB_PATH:
raise ValueError("Database path is not set or is empty.") raise ValueError("Database path is not set or is empty.")
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]: def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, bool, str]]:
check_db_path() check_db_path()
try: try:
conn = sqlite3.connect(DB_PATH) conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, screenshot FROM crawled_data WHERE url = ?', (url,)) cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot FROM crawled_data WHERE url = ?', (url,))
result = cursor.fetchone() result = cursor.fetchone()
conn.close() conn.close()
return result return result
@@ -54,14 +55,14 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, st
print(f"Error retrieving cached URL: {e}") print(f"Error retrieving cached URL: {e}")
return None return None
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", screenshot: str = ""): def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", screenshot: str = ""):
check_db_path() check_db_path()
try: try:
conn = sqlite3.connect(DB_PATH) conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute(''' cursor.execute('''
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, screenshot) INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot)
VALUES (?, ?, ?, ?, ?, ?, ?) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET ON CONFLICT(url) DO UPDATE SET
html = excluded.html, html = excluded.html,
cleaned_html = excluded.cleaned_html, cleaned_html = excluded.cleaned_html,
@@ -69,8 +70,9 @@ def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_c
extracted_content = excluded.extracted_content, extracted_content = excluded.extracted_content,
success = excluded.success, success = excluded.success,
media = excluded.media, media = excluded.media,
links = excluded.links,
screenshot = excluded.screenshot screenshot = excluded.screenshot
''', (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot)) ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot))
conn.commit() conn.commit()
conn.close() conn.close()
except Exception as e: except Exception as e:
@@ -124,5 +126,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
if __name__ == "__main__": if __name__ == "__main__":
init_db() # Initialize the database if not already initialized init_db() # Initialize the database if not already initialized
alter_db_add_screenshot() # Add the new column to the table alter_db_add_screenshot("links") # Add the new column to the table
update_existing_records() # Update existing records to set the new column to an empty string update_existing_records("links") # Update existing records to set the new column to an empty string

View File

@@ -11,6 +11,7 @@ class CrawlResult(BaseModel):
success: bool success: bool
cleaned_html: Optional[str] = None cleaned_html: Optional[str] = None
media: Dict[str, List[Dict]] = {} media: Dict[str, List[Dict]] = {}
links: Dict[str, List[Dict]] = {}
screenshot: Optional[str] = None screenshot: Optional[str] = None
markdown: Optional[str] = None markdown: Optional[str] = None
extracted_content: Optional[str] = None extracted_content: Optional[str] = None

View File

@@ -151,7 +151,7 @@ class CustomHTML2Text(HTML2Text):
super().handle_tag(tag, attrs, start) super().handle_tag(tag, attrs, start)
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None): def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
try: try:
if not html: if not html:
return None return None
@@ -171,6 +171,28 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
div_tag.append(el) div_tag.append(el)
body = div_tag body = div_tag
links = {
'internal': [],
'external': []
}
# Extract all internal and external links
for a in body.find_all('a', href=True):
href = a['href']
url_base = url.split('/')[2]
if href.startswith('http') and url_base not in href:
links['external'].append({
'href': href,
'text': a.get_text()
})
else:
links['internal'].append(
{
'href': href,
'text': a.get_text()
}
)
# Remove script, style, and other tags that don't carry useful content from body # Remove script, style, and other tags that don't carry useful content from body
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']): for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
tag.decompose() tag.decompose()
@@ -329,7 +351,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
'markdown': markdown, 'markdown': markdown,
'cleaned_html': cleaned_html, 'cleaned_html': cleaned_html,
'success': True, 'success': True,
'media': media 'media': media,
'links': links
} }
except Exception as e: except Exception as e:

View File

@@ -116,7 +116,8 @@ class WebCrawler:
"extracted_content": cached[4], "extracted_content": cached[4],
"success": cached[5], "success": cached[5],
"media": json.loads(cached[6] or "{}"), "media": json.loads(cached[6] or "{}"),
"screenshot": cached[7], "links": json.loads(cached[7] or "{}"),
"screenshot": cached[8],
"error_message": "", "error_message": "",
} }
) )
@@ -133,15 +134,16 @@ class WebCrawler:
error_message = "" error_message = ""
# Extract content from HTML # Extract content from HTML
try: try:
result = get_content_of_website(html, word_count_threshold, css_selector=css_selector) result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
if result is None: if result is None:
raise ValueError(f"Failed to extract content from the website: {url}") raise ValueError(f"Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e: except InvalidCSSSelectorError as e:
raise ValueError(str(e)) raise ValueError(str(e))
cleaned_html = result.get("cleaned_html", html) cleaned_html = result.get("cleaned_html", "")
markdown = result.get("markdown", "") markdown = result.get("markdown", "")
media = result.get("media", []) media = result.get("media", [])
links = result.get("links", [])
# Print a profession LOG style message, show time taken and say crawling is done # Print a profession LOG style message, show time taken and say crawling is done
if verbose: if verbose:
@@ -177,6 +179,7 @@ class WebCrawler:
extracted_content, extracted_content,
success, success,
json.dumps(media), json.dumps(media),
json.dumps(links),
screenshot=base64_image, screenshot=base64_image,
) )
@@ -186,6 +189,7 @@ class WebCrawler:
cleaned_html=cleaned_html, cleaned_html=cleaned_html,
markdown=markdown, markdown=markdown,
media=media, media=media,
links=links,
screenshot=base64_image, screenshot=base64_image,
extracted_content=extracted_content, extracted_content=extracted_content,
success=success, success=success,
@@ -229,3 +233,102 @@ class WebCrawler:
) )
return results return results
def run_less_db(
    self,
    url: str,
    word_count_threshold=MIN_WORD_THRESHOLD,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    bypass_cache: bool = False,
    css_selector: str = None,
    screenshot: bool = False,
    verbose=True,
    **kwargs,
) -> CrawlResult:
    """Crawl *url* with minimal database traffic and return a CrawlResult.

    Unlike the main run path, this variant only READS the cache here;
    all cache writes are deferred to process_html, which has every field
    cache_url() requires.

    Args:
        url: Address of the page to crawl.
        word_count_threshold: Minimum words per text block; clamped up to
            MIN_WORD_THRESHOLD.
        extraction_strategy: Semantic-block extraction strategy; defaults
            to NoExtractionStrategy when None.
        chunking_strategy: Strategy used to split markdown into sections.
        bypass_cache: When True, re-crawl even if a cached row exists.
        css_selector: Optional CSS selector restricting extracted content.
        screenshot: When True, capture a base64 screenshot of the page.
        verbose: When True, print progress log lines.

    Returns:
        CrawlResult produced by process_html.

    Raises:
        ValueError: If either strategy instance has the wrong type.
    """
    extraction_strategy = extraction_strategy or NoExtractionStrategy()
    extraction_strategy.verbose = verbose
    if not isinstance(extraction_strategy, ExtractionStrategy):
        raise ValueError("Unsupported extraction strategy")
    if not isinstance(chunking_strategy, ChunkingStrategy):
        raise ValueError("Unsupported chunking strategy")

    if word_count_threshold < MIN_WORD_THRESHOLD:
        word_count_threshold = MIN_WORD_THRESHOLD

    # Check cache first
    cached = None
    extracted_content = None
    if not bypass_cache and not self.always_by_pass_cache:
        cached = get_cached_url(url)

    if cached:
        html = cached[1]
        # Cached row layout is (url, html, cleaned_html, markdown,
        # extracted_content, success, media, links, screenshot); the
        # previous code read cached[2] (cleaned_html) by mistake.
        extracted_content = cached[4]
    else:
        html = self.crawler_strategy.crawl(url)
        # NOTE: no cache_url() call here — cache_url() requires
        # cleaned_html/markdown/extracted_content/success as positional
        # arguments, so the old cache_url(url, html) raised TypeError.
        # process_html performs the write once all fields are known.

    return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, **kwargs)
def process_html(
    self,
    url: str,
    html: str,
    extracted_content: str,
    word_count_threshold: int,
    extraction_strategy: ExtractionStrategy,
    chunking_strategy: ChunkingStrategy,
    css_selector: str,
    screenshot: bool,
    verbose: bool,
    **kwargs,
) -> CrawlResult:
    """Turn raw *html* into a CrawlResult: clean, extract, cache, return.

    Args:
        url: Source URL of *html* (used for link classification and logs).
        html: Raw page HTML to process.
        extracted_content: Previously-extracted content (e.g. from the
            cache); when None the extraction_strategy is run.
        word_count_threshold: Minimum words per retained text block.
        extraction_strategy: Strategy that extracts semantic blocks.
        chunking_strategy: Strategy that splits markdown into sections.
        css_selector: Optional CSS selector restricting extracted content.
        screenshot: When True, capture a base64 screenshot of the page.
        verbose: When True, print progress log lines.

    Returns:
        CrawlResult with cleaned HTML, markdown, media, links, optional
        screenshot, and the (possibly freshly computed) extracted content.

    Raises:
        ValueError: If content extraction fails or the CSS selector is
            invalid.
    """
    t = time.time()

    base64_image = None
    if screenshot:
        base64_image = self.crawler_strategy.take_screenshot()

    # Extract content from HTML
    try:
        result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
        if result is None:
            raise ValueError(f"Failed to extract content from the website: {url}")
    except InvalidCSSSelectorError as e:
        raise ValueError(str(e))

    cleaned_html = result.get("cleaned_html", "")
    markdown = result.get("markdown", "")
    media = result.get("media", [])
    links = result.get("links", [])

    if verbose:
        print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
    if verbose:
        print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")

    sections = chunking_strategy.chunk(markdown)
    if extracted_content is None:
        extracted_content = extraction_strategy.run(url, sections)
        extracted_content = json.dumps(extracted_content)

    # Cache the extracted content.  cache_url() expects
    # (url, html, cleaned_html, markdown, extracted_content, success,
    # media, links, screenshot); the previous 3-argument call both
    # raised TypeError (missing required positionals) and would have
    # stored extracted_content in the cleaned_html column.
    cache_url(
        url,
        html,
        cleaned_html,
        markdown,
        extracted_content,
        True,
        json.dumps(media),
        json.dumps(links),
        screenshot=base64_image,
    )

    if verbose:
        print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")

    return CrawlResult(
        url=url,
        html=html,
        cleaned_html=cleaned_html,
        markdown=markdown,
        media=media,
        links=links,
        screenshot=base64_image,
        extracted_content=extracted_content,
        success=True,
        error_message="",
    )

Binary file not shown.

After

Width:  |  Height:  |  Size: 375 KiB

View File

@@ -8,6 +8,14 @@ data = {
"screenshot": True, "screenshot": True,
} }
# Example of filtering the content using CSS selectors
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "css_selector": "article",
# "screenshot": True,
# }
# Example of executing a JS script on the page before extracting the content # Example of executing a JS script on the page before extracting the content
# data = { # data = {