Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-06-02 08:06:56 +00:00
parent 9b0f71ba88 e5d401c67c
commit 0d6e9e37ca
2 changed files with 61 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -41,6 +41,23 @@ result = crawler.run(url="https://www.nbcnews.com/business")
 print(result) # {url, html, markdown, extracted_content, metadata}
 ```
 If you don't want to install Selenium, you can use the REST API or a local server instead.
 ```python
 import requests
 data = {
  "urls": [
    "https://www.nbcnews.com/business"
  ],
  "word_count_threshold": 10,
  "extraction_strategy": "NoExtractionStrategy",
 }
 response = requests.post("https://crawl4ai.com/crawl", json=data) # or localhost if you run it locally
 print(response.json())
 ```
 Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
 1. Instantiate a WebCrawler object.
--- a/pages/app.js
+++ b/pages/app.js
@@ -109,6 +109,19 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
        verbose: true,
    };
    // import requests
    // data = {
    //   "urls": [
    //     "https://www.nbcnews.com/business"
    //   ],
    //   "word_count_threshold": 10,
    //   "extraction_strategy": "NoExtractionStrategy",
    // }
    // response = requests.post("https://crawl4ai.com/crawl", json=data) # or localhost if you run it locally
    // print(response.json())
    // save api token to local storage
    localStorage.setItem("api_token", document.getElementById("token-input").value);
@@ -131,18 +144,46 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
            // REMOVE API TOKEN FROM CODE EXAMPLES
            data.extraction_strategy_args.api_token = "your_api_token";
            if (data.extraction_strategy === "NoExtractionStrategy") {
                delete data.extraction_strategy_args;
                delete data.extrac_blocks;
            }
            if (data.chunking_strategy === "RegexChunking") {
                delete data.chunking_strategy_args;
            }
            delete data.verbose;
            if (data.css_selector === "") {
                delete data.css_selector;
            }
            if (!data.bypass_cache) {
                delete data.bypass_cache;
            }
            if (!data.extract_blocks) {
                delete data.extract_blocks;
            }
            if (!data.include_raw_html) {
                delete data.include_raw_html;
            }
            document.getElementById(
                "curl-code"
            ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
                ...data,
                api_token: isLLMExtraction ? "your_api_token" : undefined,
-            }, null, 2)}' http://localhost:8000/crawl`;
+            }, null, 2)}' https://crawl4ai.com/crawl`;
            document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                null,
                2
-            )}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
+            )}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
            document.getElementById(
                "nodejs-code"
@@ -150,7 +191,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                null,
                2
-            )};\n\naxios.post("http://localhost:8000/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
+            )};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
            document.getElementById(
                "library-code"