Sponsors/new (#1637)
This commit is contained in:
61
docs/examples/cloud_browser/scrapeless_browser.py
Normal file
61
docs/examples/cloud_browser/scrapeless_browser.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import json
|
||||
import asyncio
|
||||
from urllib.parse import quote, urlencode
|
||||
from crawl4ai import CrawlerRunConfig, BrowserConfig, AsyncWebCrawler
|
||||
|
||||
# Scrapeless provides a free anti-detection fingerprint browser client and cloud browsers:
|
||||
# https://www.scrapeless.com/en/blog/scrapeless-nstbrowser-strategic-integration
|
||||
|
||||
async def main():
    """Crawl the Scrapeless home page through a Scrapeless cloud browser.

    Builds a custom browser fingerprint, encodes it together with the session
    parameters into the Scrapeless WebSocket URL, connects crawl4ai to that
    remote browser over CDP, and prints the status code plus the page title
    and description.

    If the crawl fails, the error message is printed instead of the metadata.
    """
    # Customize the browser fingerprint.
    # NOTE(review): the user agent advertises Macintosh while "platform" is
    # "Windows" -- confirm this mismatch is intended.
    fingerprint = {
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.1.2.3 Safari/537.36",
        "platform": "Windows",
        "screen": {
            "width": 1280,
            "height": 1024,
        },
        "localization": {
            "languages": ["zh-HK", "en-US", "en"],
            "timezone": "Asia/Hong_Kong",
        },
    }

    # NOTE(review): the JSON is percent-encoded here and then encoded again by
    # urlencode() below, so the fingerprint is double-encoded in the final URL.
    # Presumably this is what the Scrapeless endpoint expects -- confirm.
    fingerprint_json = json.dumps(fingerprint)
    encoded_fingerprint = quote(fingerprint_json)

    scrapeless_params = {
        "token": "your token",
        "sessionTTL": 1000,
        "sessionName": "Demo",
        "fingerprint": encoded_fingerprint,
        # Sets the target country/region for the proxy, sending requests via
        # an IP address from that region. You can specify a country code
        # (e.g., US for the United States, GB for the United Kingdom, ANY for
        # any country). See country codes for all supported options.
        # "proxyCountry": "ANY",
        # Create a profile on Scrapeless, then reference it here.
        # "profileId": "your profileId",
        # For more usage details, please refer to
        # https://docs.scrapeless.com/en/scraping-browser/quickstart/getting-started
    }
    query_string = urlencode(scrapeless_params)
    scrapeless_connection_url = f"wss://browser.scrapeless.com/api/v2/browser?{query_string}"

    async with AsyncWebCrawler(
        config=BrowserConfig(
            headless=False,
            browser_mode="cdp",
            cdp_url=scrapeless_connection_url,
        )
    ) as crawler:
        result = await crawler.arun(
            url="https://www.scrapeless.com/en",
            config=CrawlerRunConfig(
                wait_for="css:.content",
                scan_full_page=True,
            ),
        )

        # A failed crawl (bad token, wait_for timeout, ...) has no usable
        # metadata; report the error instead of crashing on a missing key.
        if not result.success:
            print("-" * 20)
            print(f'Crawl failed: {result.error_message}')
            print("-" * 20)
            return

        # metadata may be None or lack a tag the page does not define.
        metadata = result.metadata or {}

        print("-" * 20)
        print(f'Status Code: {result.status_code}')
        print("-" * 20)
        print(f'Title: {metadata.get("title")}')
        print(f'Description: {metadata.get("description")}')
        print("-" * 20)
|
||||
|
||||
# Script entry point: run the async example to completion when this file is
# executed directly (not when it is imported).
if __name__ == "__main__":
    example = main()
    asyncio.run(example)
|
||||
|
||||
Reference in New Issue
Block a user