61 lines
2.3 KiB
Python
61 lines
2.3 KiB
Python
import json
|
|
import asyncio
|
|
from urllib.parse import quote, urlencode
|
|
from crawl4ai import CrawlerRunConfig, BrowserConfig, AsyncWebCrawler
|
|
|
|
# Scrapeless provides a free anti-detection fingerprint browser client and cloud browsers:
# https://www.scrapeless.com/en/blog/scrapeless-nstbrowser-strategic-integration
|
|
|
|
async def main():
    """Crawl the Scrapeless home page through a Scrapeless cloud browser.

    Builds a custom browser fingerprint, packs it together with the session
    options into the query string of the Scrapeless websocket endpoint, and
    passes that URL to crawl4ai as the CDP endpoint. When the crawl succeeds
    the status code, page title and page description are printed; when it
    fails the error reported by crawl4ai is printed instead.
    """
    # customize browser fingerprint
    # NOTE(review): the userAgent advertises macOS while "platform" says
    # "Windows" -- confirm this mismatch is intentional.
    fingerprint = {
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.1.2.3 Safari/537.36",
        "platform": "Windows",
        "screen": {
            "width": 1280, "height": 1024
        },
        "localization": {
            "languages": ["zh-HK", "en-US", "en"], "timezone": "Asia/Hong_Kong",
        }
    }

    # NOTE(review): the JSON is percent-encoded here and then encoded a second
    # time by urlencode() below. Presumably the service expects the value in
    # this double-encoded form -- confirm before simplifying.
    fingerprint_json = json.dumps(fingerprint)
    encoded_fingerprint = quote(fingerprint_json)

    scrapeless_params = {
        "token": "your token",
        "sessionTTL": 1000,
        "sessionName": "Demo",
        "fingerprint": encoded_fingerprint,
        # Sets the target country/region for the proxy, sending requests via an IP address from that region. You can specify a country code (e.g., US for the United States, GB for the United Kingdom, ANY for any country). See country codes for all supported options.
        # "proxyCountry": "ANY",
        # create profile on scrapeless
        # "profileId": "your profileId",
        # For more usage details, please refer to https://docs.scrapeless.com/en/scraping-browser/quickstart/getting-started
    }
    query_string = urlencode(scrapeless_params)
    scrapeless_connection_url = f"wss://browser.scrapeless.com/api/v2/browser?{query_string}"

    async with AsyncWebCrawler(
        config=BrowserConfig(
            headless=False,
            browser_mode="cdp",
            cdp_url=scrapeless_connection_url,
        )
    ) as crawler:
        result = await crawler.arun(
            url="https://www.scrapeless.com/en",
            config=CrawlerRunConfig(
                wait_for="css:.content",
                scan_full_page=True,
            ),
        )

        # A failed crawl carries no usable metadata; report the failure
        # instead of crashing while indexing into it.
        if not result.success:
            print("-" * 20)
            print(f'Status Code: {result.status_code}')
            print(f'Crawl failed: {result.error_message}')
            print("-" * 20)
            return

        # metadata can be None, and a page may lack a title or description
        # tag, so read the fields defensively.
        metadata = result.metadata or {}

        print("-" * 20)
        print(f'Status Code: {result.status_code}')
        print("-" * 20)
        print(f'Title: {metadata.get("title")}')
        print(f'Description: {metadata.get("description")}')
        print("-" * 20)
|
|
|
|
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
|
|
|