- User agent

- Extract Links
- Extract Metadata
- Update Readme
- Update REST API documentation
2024-06-08 17:59:42 +08:00
parent 9c34b30723
commit b3a0edaa6d
12 changed files with 155 additions and 75 deletions
--- a/docs/examples/assets/basic.png
+++ b/docs/examples/assets/basic.png
--- a/docs/examples/assets/css_js.png
+++ b/docs/examples/assets/css_js.png
--- a/docs/examples/assets/semantic_extraction_cosine.png
+++ b/docs/examples/assets/semantic_extraction_cosine.png
--- a/docs/examples/assets/semantic_extraction_llm.png
+++ b/docs/examples/assets/semantic_extraction_llm.png
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -1,75 +1,64 @@

 import requests, base64, os

+data = {
+    "urls": ["https://www.nbcnews.com/business"],
+    "screenshot": True,
+}
+
+response = requests.post("https://crawl4ai.com/crawl", json=data) 
+result = response.json()['results'][0]
+print(result.keys())
+# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media', 
+# 'links', 'screenshot', 'markdown', 'extracted_content', 
+# 'metadata', 'error_message'])
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result['screenshot']))
+    
+# Example of filtering the content using CSS selectors
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "css_selector": "article",
+    "screenshot": True,
+}
+
+# Example of executing a JS script on the page before extracting the content
 data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "screenshot": True,
+    'js' : ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).
+    find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """]
 }

-# Example of filtering the content using CSS selectors
-# data = {
-#     "urls": [
-#         "https://www.nbcnews.com/business"
-#     ],
-#     "css_selector": "article",
-#     "screenshot": True,
-# }
-
-# Example of executing a JS script on the page before extracting the content
-# data = {
-#     "urls": [
-#         "https://www.nbcnews.com/business"
-#     ],
-#     "screenshot": True,
-#     'js' : ["""
-#     const loadMoreButton = Array.from(document.querySelectorAll('button')).
-#     find(button => button.textContent.includes('Load More'));
-#     loadMoreButton && loadMoreButton.click();
-#     """]
-# }
-
 # Example of using a custom extraction strategy
-# data = {
-#     "urls": [
-#         "https://www.nbcnews.com/business"
-#     ],
-#     "extraction_strategy": "CosineStrategy",
-#     "extraction_strategy_args": {
-#         "semantic_filter": "inflation rent prices"
-#     },
-# }
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "extraction_strategy": "CosineStrategy",
+    "extraction_strategy_args": {
+        "semantic_filter": "inflation rent prices"
+    },
+}

 # Example of using LLM to extract content
-# data = {
-#     "urls": [
-#         "https://www.nbcnews.com/business"
-#     ],
-#     "extraction_strategy": "LLMExtractionStrategy",
-#     "extraction_strategy_args": {
-#         "provider": "groq/llama3-8b-8192",
-#         "api_token": os.environ.get("GROQ_API_KEY"),
-#         "instruction": """I am interested in only financial news, 
-#         and translate them in French."""
-#     },
-# }
-
-response = requests.post("https://crawl4ai.com/crawl", json=data) 
-result = response.json()['results'][0]
-
-print(result['markdown'])
-print(result['cleaned_html'])
-print(result['media'])
-print(result['extracted_content'])
-with open("screenshot.png", "wb") as f:
-    f.write(base64.b64decode(result['screenshot']))
-
-
-
-
-
-
-
-
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "extraction_strategy": "LLMExtractionStrategy",
+    "extraction_strategy_args": {
+        "provider": "groq/llama3-8b-8192",
+        "api_token": os.environ.get("GROQ_API_KEY"),
+        "instruction": """I am interested in only financial news, 
+        and translate them in French."""
+    },
+}