Compare commits

...

11 Commits

Author SHA1 Message Date
UncleCode
ac9d83c72f Update gitignore 2024-10-27 19:29:04 +08:00
UncleCode
ff9149b5c9 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-10-27 19:28:05 +08:00
UncleCode
32f57c49d6 Merge pull request #194 from IdrisHanafi/feat/customize-crawl-base-directory
Support for custom crawl base directory
2024-10-24 13:09:27 +02:00
Idris Hanafi
a5f627ba1a feat: customize crawl base directory 2024-10-21 17:58:39 -04:00
UncleCode
dbb587d681 Update gitignore 2024-10-17 21:38:48 +08:00
unclecode
9ffa34b697 Update README 2024-10-14 22:58:27 +08:00
unclecode
740802c491 Merge branch '0.3.6' 2024-10-14 22:55:24 +08:00
unclecode
b9ac96c332 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-10-14 22:54:23 +08:00
unclecode
d06535388a Update gitignore 2024-10-14 22:53:56 +08:00
UncleCode
ccbe72cfc1 Merge pull request #135 from hitesh22rana/fix/docs-example
docs: fixed css_selector for example
2024-10-13 14:39:07 +08:00
hitesh22rana
768b93140f docs: fixed css_selector for example 2024-10-05 00:25:41 +09:00
3 changed files with 15 additions and 4 deletions

6
.gitignore vendored
View File

@@ -202,7 +202,9 @@ todo.md
git_changes.py
git_changes.md
pypi_build.sh
git_issues.py
git_issues.md
.tests/
git_changes.py
git_changes.md .issues/

View File

@@ -10,6 +10,14 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
> Looking for the synchronous version? Check out [README.sync.md](./README.sync.md). You can also access the previous version in the branch [V0.2.76](https://github.com/unclecode/crawl4ai/blob/v0.2.76).
## New update 0.3.6
- 🌐 Multi-browser support (Chromium, Firefox, WebKit)
- 🖼️ Improved image processing with lazy-loading detection
- 🔧 Custom page timeout parameter for better control over crawling behavior
- 🕰️ Enhanced handling of delayed content loading
- 🔑 Custom headers support for LLM interactions
- 🖼️ iframe content extraction for comprehensive page analysis
- ⏱️ Flexible timeout and delayed content retrieval options
## Try it Now!
@@ -124,7 +132,7 @@ async def main():
result = await crawler.arun(
url="https://www.nbcnews.com/business",
js_code=js_code,
css_selector="article.tease-card", css_selector=".wide-tease-item__description",
bypass_cache=True
)
print(result.extracted_content)

View File

@@ -23,13 +23,14 @@ class AsyncWebCrawler:
self,
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
always_by_pass_cache: bool = False,
base_directory: str = str(Path.home()),
**kwargs,
):
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
**kwargs
)
self.always_by_pass_cache = always_by_pass_cache
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
self.ready = False