Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-06-02 08:06:56 +00:00
parent 9b0f71ba88 e5d401c67c
commit 0d6e9e37ca
2 changed files with 61 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -41,6 +41,23 @@ result = crawler.run(url="https://www.nbcnews.com/business")
 print(result) # {url, html, markdown, extracted_content, metadata}
 ```

+If you don't want to install Selenium, you can use the hosted REST API or run a local server instead.
+
+```python
+import requests
+
+data = {
+  "urls": [
+    "https://www.nbcnews.com/business"
+  ],
+  "word_count_threshold": 10,
+  "extraction_strategy": "NoExtractionStrategy",
+}
+
+response = requests.post("https://crawl4ai.com/crawl", json=data) # or use your localhost URL if you run the server locally
+print(response.json())
+```
+
 Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!

 1. Instantiate a WebCrawler object.
--- a/pages/app.js
+++ b/pages/app.js
@@ -109,6 +109,19 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
        verbose: true,
    };

+    // import requests
+
+    // data = {
+    //   "urls": [
+    //     "https://www.nbcnews.com/business"
+    //   ],
+    //   "word_count_threshold": 10,
+    //   "extraction_strategy": "NoExtractionStrategy",
+    // }
+    
+    // response = requests.post("https://crawl4ai.com/crawl", json=data) # or use your localhost URL if you run the server locally
+    // print(response.json())
+
    // save api token to local storage
    localStorage.setItem("api_token", document.getElementById("token-input").value);

@@ -131,18 +144,46 @@ document.getElementById("crawl-btn").addEventListener("click", () => {

            // REMOVE API TOKEN FROM CODE EXAMPLES
            data.extraction_strategy_args.api_token = "your_api_token";
+
+            if (data.extraction_strategy === "NoExtractionStrategy") {
+                delete data.extraction_strategy_args;
+                delete data.extrac_blocks;
+            }
+
+            if (data.chunking_strategy === "RegexChunking") {
+                delete data.chunking_strategy_args;
+            }
+
+            delete data.verbose;
+
+            if (data.css_selector === "") {
+                delete data.css_selector;
+            }
+
+            if (!data.bypass_cache) {
+                delete data.bypass_cache;
+            }
+
+            if (!data.extract_blocks) {
+                delete data.extract_blocks;
+            }
+
+            if (!data.include_raw_html) {
+                delete data.include_raw_html;
+            }
+
            document.getElementById(
                "curl-code"
            ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
                ...data,
                api_token: isLLMExtraction ? "your_api_token" : undefined,
-            }, null, 2)}' http://localhost:8000/crawl`;
+            }, null, 2)}' https://crawl4ai.com/crawl`;

            document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                null,
                2
-            )}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
+            )}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;

            document.getElementById(
                "nodejs-code"
@@ -150,7 +191,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                null,
                2
-            )};\n\naxios.post("http://localhost:8000/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
+            )};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;

            document.getElementById(
                "library-code"