- User agent

- Extract Links
- Extract Metadata
- Update Readme
- Update REST API document
2024-06-08 17:59:42 +08:00
parent 9c34b30723
commit b3a0edaa6d
12 changed files with 155 additions and 75 deletions
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -44,6 +44,10 @@ class CrawlerStrategy(ABC):
    @abstractmethod
    def take_screenshot(self, save_path: str):
        pass
+    
+    # Not abstract: strategies without an override (none is added for CloudCrawlerStrategy) must stay instantiable.
+    def update_user_agent(self, user_agent: str):
+        raise NotImplementedError(f"{type(self).__name__} does not support changing the user agent")

 class CloudCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html = False):
@@ -69,6 +73,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
        self.options = Options()
        self.options.headless = True
+        if kwargs.get("user_agent"):
+            self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--headless")
        # self.options.add_argument("--disable-dev-shm-usage")
@@ -97,6 +103,11 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        self.service.log_path = "NUL"
        self.driver = webdriver.Chrome(service=self.service, options=self.options)

+    def update_user_agent(self, user_agent: str):  # relaunches Chrome so the new User-Agent is used
+        self.options.add_argument(f"--user-agent={user_agent}")  # same "--user-agent=" spelling as __init__
+        self.driver.quit()  # the running browser was created from the previous options
+        self.driver = webdriver.Chrome(service=self.service, options=self.options)
+
    def crawl(self, url: str) -> str:
        if self.use_cached_html:
            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -21,6 +21,7 @@ def init_db():
            success BOOLEAN,
            media TEXT DEFAULT "{}",
            link TEXT DEFAULT "{}",
+            metadata TEXT DEFAULT "{}",
            screenshot TEXT DEFAULT ""
        )
    ''')
@@ -42,12 +43,12 @@ def check_db_path():
    if not DB_PATH:
        raise ValueError("Database path is not set or is empty.")

-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, bool, str]]:
+def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
@@ -55,14 +56,14 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, boo
        print(f"Error retrieving cached URL: {e}")
        return None

-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", screenshot: str = ""):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot)
-            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
@@ -70,9 +71,10 @@ def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_c
                extracted_content = excluded.extracted_content,
                success = excluded.success,
                media = excluded.media,      
-                links = excluded.links,          
+                links = excluded.links,    
+                metadata = excluded.metadata,      
                screenshot = excluded.screenshot
-        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot))
+        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
        conn.commit()
        conn.close()
    except Exception as e:
@@ -126,5 +128,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"

 if __name__ == "__main__":
    init_db()  # Initialize the database if not already initialized
-    alter_db_add_screenshot("links")  # Add the new column to the table
-    update_existing_records("links")  # Update existing records to set the new column to an empty string
+    alter_db_add_screenshot("metadata")  # Add the new column to the table
+    update_existing_records("metadata")  # Update existing records; default_value defaults to "{}", not an empty string
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -359,6 +359,47 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
        print('Error processing HTML content:', str(e))
        raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e

+
+
+def extract_metadata(html):
+    """Collect page-level metadata from an HTML document.
+
+    Args:
+        html: Raw HTML text; empty or None yields an empty dict.
+
+    Returns:
+        dict with 'title', 'description', 'keywords' and 'author' (each None
+        when absent) plus one entry per Open Graph ('og:*') and Twitter Card
+        ('twitter:*') meta tag, keyed by the tag's own property/name.
+    """
+    metadata = {}
+    if not html:
+        return metadata
+
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Title; None when the document has no <title> tag.
+    title_tag = soup.find('title')
+    metadata['title'] = title_tag.string if title_tag else None
+
+    # Standard <meta name="..."> fields. .get() is used for 'content' throughout
+    # so a tag that lacks the attribute maps to None instead of raising KeyError.
+    for field in ('description', 'keywords', 'author'):
+        tag = soup.find('meta', attrs={'name': field})
+        metadata[field] = tag.get('content') if tag else None
+
+    # Open Graph metadata: <meta property="og:...">
+    og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
+    for tag in og_tags:
+        metadata[tag['property']] = tag.get('content')
+
+    # Twitter Card metadata: <meta name="twitter:...">
+    twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
+    for tag in twitter_tags:
+        metadata[tag['name']] = tag.get('content')
+
+    return metadata
+
 def extract_xml_tags(string):
    tags = re.findall(r'<(\w+)>', string)
    return list(set(tags))
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -89,8 +89,11 @@ class WebCrawler:
        css_selector: str = None,
        screenshot: bool = False,
        verbose=True,
+        user_agent: str = None,
        **kwargs,
    ) -> CrawlResult:
+        if user_agent:
+            self.crawler_strategy.update_user_agent(user_agent)
        extraction_strategy = extraction_strategy or NoExtractionStrategy()
        extraction_strategy.verbose = verbose
        # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
@@ -117,7 +120,8 @@ class WebCrawler:
                        "success": cached[5],
                        "media": json.loads(cached[6] or "{}"),
                        "links": json.loads(cached[7] or "{}"),
-                        "screenshot": cached[8],
+                        "metadata": json.loads(cached[8] or "{}"), # "metadata": "{}
+                        "screenshot": cached[9],
                        "error_message": "",
                    }
                )
@@ -135,6 +139,7 @@ class WebCrawler:
        # Extract content from HTML
        try:
            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
+            metadata = extract_metadata(html)
            if result is None:
                raise ValueError(f"Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
@@ -180,6 +185,7 @@ class WebCrawler:
            success,
            json.dumps(media),
            json.dumps(links),
+            json.dumps(metadata),
            screenshot=base64_image,
        )

@@ -190,6 +196,7 @@ class WebCrawler:
            markdown=markdown,
            media=media,
            links=links,
+            metadata=metadata,
            screenshot=base64_image,
            extracted_content=extracted_content,
            success=success,