diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index cd94e9e7..3d40c528 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -9,7 +9,7 @@ from selenium.common.exceptions import InvalidArgumentException, WebDriverExcept from selenium.webdriver.chrome.service import Service as ChromeService from webdriver_manager.chrome import ChromeDriverManager -import logging +import logging, time import base64 from PIL import Image, ImageDraw, ImageFont from io import BytesIO @@ -177,7 +177,19 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): # Set extra HTTP headers self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers}) + def _ensure_page_load(self, max_checks=6, check_interval=0.01): + initial_length = len(self.driver.page_source) + + for ix in range(max_checks): + print(f"Checking page load: {ix}") + time.sleep(check_interval) + current_length = len(self.driver.page_source) + + if current_length != initial_length: + break + return self.driver.page_source + def crawl(self, url: str) -> str: # Create md5 hash of the URL import hashlib @@ -194,10 +206,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): if self.verbose: print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...") self.driver.get(url) # - html = self.driver.page_source + + WebDriverWait(self.driver, 20).until( + lambda d: d.execute_script('return document.readyState') == 'complete' + ) WebDriverWait(self.driver, 10).until( EC.presence_of_all_elements_located((By.TAG_NAME, "body")) ) + html = self._ensure_page_load() # self.driver.page_source can_not_be_done_headless = False # Look at my creativity for naming variables # TODO: Very ugly way for now but it works if html == "": diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index c468c49a..be6337de 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -439,71 +439,75 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: media = {'images': [], 'videos': [], 'audios': []} def process_element(element: element.PageElement) -> bool: - if isinstance(element, NavigableString): - if isinstance(element, Comment): - element.extract() - return False + try: + if isinstance(element, NavigableString): + if isinstance(element, Comment): + element.extract() + return False - if element.name in ['script', 'style', 'link', 'meta', 'noscript']: - element.decompose() - return False + if element.name in ['script', 'style', 'link', 'meta', 'noscript']: + element.decompose() + return False - keep_element = False + keep_element = False - if element.name == 'a' and element.get('href'): - href = element['href'] - url_base = url.split('/')[2] - link_data = {'href': href, 'text': element.get_text()} - if href.startswith('http') and url_base not in href: - links['external'].append(link_data) - else: - links['internal'].append(link_data) - keep_element = True - - elif element.name == 'img': - media['images'].append({ - 'src': element.get('src'), - 'alt': element.get('alt'), - 'type': 'image' - }) - return True # Always keep image elements - - elif element.name in ['video', 'audio']: - media[f"{element.name}s"].append({ - 'src': element.get('src'), - 'alt': element.get('alt'), - 'type': element.name - }) - return True # Always keep video and audio elements - - if element.name != 'pre': - if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']: - if kwargs.get('only_text', False): - element.replace_with(element.get_text()) + if element.name == 'a' and element.get('href'): + href = element['href'] + url_base = url.split('/')[2] + link_data = {'href': href, 'text': element.get_text()} + if href.startswith('http') and url_base not in href: + links['external'].append(link_data) else: - element.unwrap() - elif element.name != 'img': - element.attrs = {} + links['internal'].append(link_data) + keep_element = True - # Process children - for child in list(element.children): - if isinstance(child, NavigableString) and not isinstance(child, Comment): - if len(child.strip()) > 0: - keep_element = True - else: - if process_element(child): - keep_element = True - + elif element.name == 'img': + media['images'].append({ + 'src': element.get('src'), + 'alt': element.get('alt'), + 'type': 'image' + }) + return True # Always keep image elements - # Check word count - if not keep_element: - word_count = len(element.get_text(strip=True).split()) - keep_element = word_count >= word_count_threshold + elif element.name in ['video', 'audio']: + media[f"{element.name}s"].append({ + 'src': element.get('src'), + 'alt': element.get('alt'), + 'type': element.name + }) + return True # Always keep video and audio elements - if not keep_element: - element.decompose() + if element.name != 'pre': + if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']: + if kwargs.get('only_text', False): + element.replace_with(element.get_text()) + else: + element.unwrap() + elif element.name != 'img': + element.attrs = {} - return keep_element + # Process children + for child in list(element.children): + if isinstance(child, NavigableString) and not isinstance(child, Comment): + if len(child.strip()) > 0: + keep_element = True + else: + if process_element(child): + keep_element = True + + + # Check word count + if not keep_element: + word_count = len(element.get_text(strip=True).split()) + keep_element = word_count >= word_count_threshold + + if not keep_element: + element.decompose() + + return keep_element + except Exception as e: + print('Error processing element:', str(e)) + return False process_element(body)