Compare commits
11 Commits
0.3.6
...
main-0.3.7
| Author | SHA1 | Date | |
|---|---|---|---|
| | ac9d83c72f | | |
| | ff9149b5c9 | | |
| | 32f57c49d6 | | |
| | a5f627ba1a | | |
| | dbb587d681 | | |
| | 9ffa34b697 | | |
| | 740802c491 | | |
| | b9ac96c332 | | |
| | d06535388a | | |
| | ccbe72cfc1 | | |
| | 768b93140f | | |
6
.gitignore
vendored
6
.gitignore
vendored
@@ -202,7 +202,9 @@ todo.md
|
|||||||
git_changes.py
|
git_changes.py
|
||||||
git_changes.md
|
git_changes.md
|
||||||
pypi_build.sh
|
pypi_build.sh
|
||||||
|
git_issues.py
|
||||||
|
git_issues.md
|
||||||
|
|
||||||
.tests/
|
.tests/
|
||||||
git_changes.py
|
|
||||||
git_changes.md
|
.issues/
|
||||||
10
README.md
10
README.md
@@ -10,6 +10,14 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
|
|||||||
|
|
||||||
> Looking for the synchronous version? Check out [README.sync.md](./README.sync.md). You can also access the previous version in the branch [V0.2.76](https://github.com/unclecode/crawl4ai/blob/v0.2.76).
|
> Looking for the synchronous version? Check out [README.sync.md](./README.sync.md). You can also access the previous version in the branch [V0.2.76](https://github.com/unclecode/crawl4ai/blob/v0.2.76).
|
||||||
|
|
||||||
|
## New update 0.3.6
|
||||||
|
- 🌐 Multi-browser support (Chromium, Firefox, WebKit)
|
||||||
|
- 🖼️ Improved image processing with lazy-loading detection
|
||||||
|
- 🔧 Custom page timeout parameter for better control over crawling behavior
|
||||||
|
- 🕰️ Enhanced handling of delayed content loading
|
||||||
|
- 🔑 Custom headers support for LLM interactions
|
||||||
|
- 🖼️ iframe content extraction for comprehensive page analysis
|
||||||
|
- ⏱️ Flexible timeout and delayed content retrieval options
|
||||||
|
|
||||||
## Try it Now!
|
## Try it Now!
|
||||||
|
|
||||||
@@ -124,7 +132,7 @@ async def main():
|
|||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
js_code=js_code,
|
js_code=js_code,
|
||||||
css_selector="article.tease-card",
|
css_selector=".wide-tease-item__description",
|
||||||
bypass_cache=True
|
bypass_cache=True
|
||||||
)
|
)
|
||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
|
|||||||
@@ -23,13 +23,14 @@ class AsyncWebCrawler:
|
|||||||
self,
|
self,
|
||||||
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
|
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
|
||||||
always_by_pass_cache: bool = False,
|
always_by_pass_cache: bool = False,
|
||||||
|
base_directory: str = str(Path.home()),
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
|
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
self.always_by_pass_cache = always_by_pass_cache
|
self.always_by_pass_cache = always_by_pass_cache
|
||||||
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
|
||||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||||
self.ready = False
|
self.ready = False
|
||||||
|
|||||||
Reference in New Issue
Block a user