"""Populate the marketplace database with a fixed set of dummy records.

Running this module wipes the ``apps``, ``articles``, ``categories`` and
``sponsors`` tables and refills them with the sample data defined below.
"""

import json
import random
import re
import sqlite3
from datetime import datetime, timedelta
from urllib.parse import quote_plus

from database import DatabaseManager

# Tables emptied before seeding, in the order they are cleared.
TABLES_TO_CLEAR = ('apps', 'articles', 'categories', 'sponsors')

# (name, icon, description)
CATEGORIES = [
    ("Browser Automation", "⚙", "Tools for browser automation and control"),
    ("Proxy Services", "🔒", "Proxy providers and rotation services"),
    ("LLM Integration", "🤖", "AI/LLM tools and integrations"),
    ("Data Processing", "📊", "Data extraction and processing tools"),
    ("Cloud Infrastructure", "☁", "Cloud browser and computing services"),
    ("Developer Tools", "🛠", "Development and testing utilities"),
]

# Apps with real Unsplash images.
# (name, category, type, featured, sponsored, description, website_url,
#  github_url, pricing, rating, downloads, image)
APPS_DATA = [
    # Browser Automation
    ("Playwright Cloud", "Browser Automation", "Paid", True, True,
     "Scalable browser automation in the cloud with Playwright",
     "https://playwright.cloud", None, "$99/month starter", 4.8, 12500,
     "https://images.unsplash.com/photo-1633356122544-f134324a6cee?w=800&h=400&fit=crop"),
    ("Selenium Grid Hub", "Browser Automation", "Freemium", False, False,
     "Distributed Selenium grid for parallel testing",
     "https://seleniumhub.io", "https://github.com/seleniumhub/grid",
     "Free - $299/month", 4.2, 8400,
     "https://images.unsplash.com/photo-1555066931-4365d14bab8c?w=800&h=400&fit=crop"),
    ("Puppeteer Extra", "Browser Automation", "Open Source", True, False,
     "Enhanced Puppeteer with stealth plugins and more",
     "https://puppeteer-extra.dev", "https://github.com/berstend/puppeteer-extra",
     "Free", 4.6, 15200,
     "https://images.unsplash.com/photo-1461749280684-dccba630e2f6?w=800&h=400&fit=crop"),

    # Proxy Services
    ("BrightData", "Proxy Services", "Paid", True, True,
     "Premium proxy network with 72M+ IPs worldwide",
     "https://brightdata.com", None, "Starting $500/month", 4.7, 9800,
     "https://images.unsplash.com/photo-1558494949-ef010cbdcc31?w=800&h=400&fit=crop"),
    ("SmartProxy", "Proxy Services", "Paid", False, True,
     "Residential and datacenter proxies with rotation",
     "https://smartproxy.com", None, "Starting $75/month", 4.3, 7600,
     "https://images.unsplash.com/photo-1544197150-b99a580bb7a8?w=800&h=400&fit=crop"),
    ("ProxyMesh", "Proxy Services", "Freemium", False, False,
     "Rotating proxy servers with sticky sessions",
     "https://proxymesh.com", None, "$10-$50/month", 4.0, 4200,
     "https://images.unsplash.com/photo-1451187580459-43490279c0fa?w=800&h=400&fit=crop"),

    # LLM Integration
    ("LangChain Crawl", "LLM Integration", "Open Source", True, False,
     "LangChain integration for Crawl4AI workflows",
     "https://langchain-crawl.dev", "https://github.com/langchain/crawl",
     "Free", 4.5, 18900,
     "https://images.unsplash.com/photo-1677442136019-21780ecad995?w=800&h=400&fit=crop"),
    ("GPT Scraper", "LLM Integration", "Freemium", False, False,
     "Extract structured data using GPT models",
     "https://gptscraper.ai", None, "Free - $99/month", 4.1, 5600,
     "https://images.unsplash.com/photo-1655720828018-edd2daec9349?w=800&h=400&fit=crop"),
    ("Claude Extract", "LLM Integration", "Paid", True, True,
     "Professional extraction using Claude AI",
     "https://claude-extract.com", None, "$199/month", 4.9, 3200,
     "https://images.unsplash.com/photo-1686191128892-3b09ad503b4f?w=800&h=400&fit=crop"),

    # Data Processing
    ("DataMiner Pro", "Data Processing", "Paid", False, False,
     "Advanced data extraction and transformation",
     "https://dataminer.pro", None, "$149/month", 4.2, 6700,
     "https://images.unsplash.com/photo-1551288049-bebda4e38f71?w=800&h=400&fit=crop"),
    ("ScraperAPI", "Data Processing", "Freemium", True, True,
     "Simple API for web scraping with proxy rotation",
     "https://scraperapi.com", None, "Free - $299/month", 4.6, 22300,
     "https://images.unsplash.com/photo-1460925895917-afdab827c52f?w=800&h=400&fit=crop"),
    ("Apify", "Data Processing", "Freemium", False, False,
     "Web scraping and automation platform",
     "https://apify.com", None, "$49-$499/month", 4.4, 14500,
     "https://images.unsplash.com/photo-1504639725590-34d0984388bd?w=800&h=400&fit=crop"),

    # Cloud Infrastructure
    ("BrowserCloud", "Cloud Infrastructure", "Paid", True, True,
     "Managed headless browsers in the cloud",
     "https://browsercloud.io", None, "$199/month", 4.5, 8900,
     "https://images.unsplash.com/photo-1667372393119-3d4c48d07fc9?w=800&h=400&fit=crop"),
    ("LambdaTest", "Cloud Infrastructure", "Freemium", False, False,
     "Cross-browser testing on cloud",
     "https://lambdatest.com", None, "Free - $99/month", 4.1, 11200,
     "https://images.unsplash.com/photo-1451187580459-43490279c0fa?w=800&h=400&fit=crop"),
    ("Browserless", "Cloud Infrastructure", "Freemium", True, False,
     "Headless browser automation API",
     "https://browserless.io", None, "$50-$500/month", 4.7, 19800,
     "https://images.unsplash.com/photo-1639762681485-074b7f938ba0?w=800&h=400&fit=crop"),

    # Developer Tools
    ("Crawl4AI VSCode", "Developer Tools", "Open Source", True, False,
     "VSCode extension for Crawl4AI development",
     "https://marketplace.visualstudio.com", "https://github.com/crawl4ai/vscode",
     "Free", 4.8, 34500,
     "https://images.unsplash.com/photo-1629654297299-c8506221ca97?w=800&h=400&fit=crop"),
    ("Postman Collection", "Developer Tools", "Open Source", False, False,
     "Postman collection for Crawl4AI API testing",
     "https://postman.com/crawl4ai", "https://github.com/crawl4ai/postman",
     "Free", 4.3, 7800,
     "https://images.unsplash.com/photo-1599507593499-a3f7d7d97667?w=800&h=400&fit=crop"),
    ("Debug Toolkit", "Developer Tools", "Open Source", False, False,
     "Debugging tools for crawler development",
     "https://debug.crawl4ai.com", "https://github.com/crawl4ai/debug",
     "Free", 4.0, 4300,
     "https://images.unsplash.com/photo-1515879218367-8466d910aaa4?w=800&h=400&fit=crop"),
]

# Articles with real images.
# (title, category, author, related app names, tags, featured image)
ARTICLES_DATA = [
    ("Browser Automation Showdown: Playwright vs Puppeteer vs Selenium",
     "Review", "John Doe",
     ["Playwright Cloud", "Puppeteer Extra"],
     ["browser-automation", "comparison", "2024"],
     "https://images.unsplash.com/photo-1587620962725-abab7fe55159?w=1200&h=630&fit=crop"),
    ("Top 5 Proxy Services for Web Scraping in 2024",
     "Comparison", "Jane Smith",
     ["BrightData", "SmartProxy", "ProxyMesh"],
     ["proxy", "web-scraping", "guide"],
     "https://images.unsplash.com/photo-1558494949-ef010cbdcc31?w=1200&h=630&fit=crop"),
    ("Integrating LLMs with Crawl4AI: A Complete Guide",
     "Tutorial", "Crawl4AI Team",
     ["LangChain Crawl", "GPT Scraper", "Claude Extract"],
     ["llm", "integration", "tutorial"],
     "https://images.unsplash.com/photo-1677442136019-21780ecad995?w=1200&h=630&fit=crop"),
    ("Building Scalable Crawlers with Cloud Infrastructure",
     "Tutorial", "Mike Johnson",
     ["BrowserCloud", "Browserless"],
     ["cloud", "scalability", "architecture"],
     "https://images.unsplash.com/photo-1667372393119-3d4c48d07fc9?w=1200&h=630&fit=crop"),
    ("What's New in Crawl4AI Marketplace",
     "News", "Crawl4AI Team",
     [],
     ["marketplace", "announcement", "news"],
     "https://images.unsplash.com/photo-1556075798-4825dfaaf498?w=1200&h=630&fit=crop"),
    ("Cost Analysis: Self-Hosted vs Cloud Browser Solutions",
     "Comparison", "Sarah Chen",
     ["BrowserCloud", "LambdaTest", "Browserless"],
     ["cost", "cloud", "comparison"],
     "https://images.unsplash.com/photo-1554224155-8d04cb21cd6c?w=1200&h=630&fit=crop"),
    ("Getting Started with Browser Automation",
     "Tutorial", "Crawl4AI Team",
     ["Playwright Cloud", "Selenium Grid Hub"],
     ["beginner", "tutorial", "automation"],
     "https://images.unsplash.com/photo-1498050108023-c5249f4df085?w=1200&h=630&fit=crop"),
    ("The Future of Web Scraping: AI-Powered Extraction",
     "News", "Dr. Alan Turing",
     ["Claude Extract", "GPT Scraper"],
     ["ai", "future", "trends"],
     "https://images.unsplash.com/photo-1593720213428-28a5b9e94613?w=1200&h=630&fit=crop"),
]

# (company, tier, landing_url, banner_url)
SPONSORS_DATA = [
    ("BrightData", "Gold", "https://brightdata.com",
     "https://images.unsplash.com/photo-1558494949-ef010cbdcc31?w=728&h=90&fit=crop"),
    ("ScraperAPI", "Gold", "https://scraperapi.com",
     "https://images.unsplash.com/photo-1460925895917-afdab827c52f?w=728&h=90&fit=crop"),
    ("BrowserCloud", "Silver", "https://browsercloud.io",
     "https://images.unsplash.com/photo-1667372393119-3d4c48d07fc9?w=728&h=90&fit=crop"),
    ("Claude Extract", "Silver", "https://claude-extract.com",
     "https://images.unsplash.com/photo-1686191128892-3b09ad503b4f?w=728&h=90&fit=crop"),
    ("SmartProxy", "Bronze", "https://smartproxy.com",
     "https://images.unsplash.com/photo-1544197150-b99a580bb7a8?w=728&h=90&fit=crop"),
]


def generate_slug(text):
    """Return a lowercase, hyphen-separated slug for *text*.

    ``&`` becomes ``and``, apostrophes are dropped, and every other run of
    non-word characters (spaces, colons, hyphens, ...) collapses to a single
    hyphen. Leading and trailing hyphens are stripped.

    Args:
        text: Human-readable name or title.

    Returns:
        The slug, e.g. ``"What's New: A & B"`` -> ``"whats-new-a-and-b"``.
    """
    slug = text.lower().replace('&', 'and')
    # Drop apostrophes instead of hyphenating them so "what's" -> "whats".
    slug = slug.replace("'", "").replace("\u2019", "")
    # Punctuation such as ':' previously leaked into the slug verbatim.
    slug = re.sub(r"\W+", "-", slug)
    return slug.strip("-")


def _clear_tables(cursor):
    """Delete every row from the tables listed in ``TABLES_TO_CLEAR``."""
    for table in TABLES_TO_CLEAR:
        # Table names come from the constant above, never from user input,
        # so interpolating them into the statement is safe here.
        cursor.execute(f"DELETE FROM {table}")


def _insert_categories(cursor):
    """Insert ``CATEGORIES``, using the list position as ``order_index``."""
    for position, (name, icon, desc) in enumerate(CATEGORIES):
        cursor.execute("""
            INSERT INTO categories (name, slug, icon, description, order_index)
            VALUES (?, ?, ?, ?, ?)
        """, (name, generate_slug(name), icon, desc, position))


def _random_screenshot_url():
    """Return a placeholder Unsplash-style URL built from two random numbers."""
    return (
        f"https://images.unsplash.com/photo-"
        f"{random.randint(1500000000000, 1700000000000)}-"
        f"{random.randint(1000000000000, 9999999999999)}"
        f"?w=800&h=600&fit=crop"
    )


def _insert_apps(cursor):
    """Insert ``APPS_DATA`` with generated slug, logo, guide, email and views."""
    for (name, category, type_, featured, sponsored, desc, url, github,
         pricing, rating, downloads, image) in APPS_DATA:
        slug = generate_slug(name)
        screenshots = json.dumps(
            [_random_screenshot_url(), _random_screenshot_url()]
        )
        # Names contain spaces, so they must be encoded for the query string.
        logo_url = (
            f"https://ui-avatars.com/api/?name={quote_plus(name)}"
            f"&background=50ffff&color=070708&size=128"
        )
        integration_guide = (
            f"# {name} Integration\n\n```python\n"
            f"from crawl4ai import AsyncWebCrawler\n"
            f"# Integration code coming soon...\n```"
        )
        cursor.execute("""
            INSERT INTO apps (name, slug, description, category, type,
                              featured, sponsored, website_url, github_url,
                              pricing, rating, downloads, image, screenshots,
                              logo_url, integration_guide, contact_email, views)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (name, slug, desc, category, type_, featured, sponsored, url,
              github, pricing, rating, downloads, image, screenshots,
              logo_url, integration_guide, f"contact@{slug}.com",
              random.randint(100, 5000)))


def _build_article_content(title, author, published):
    """Return the placeholder markdown body for one article.

    Headings, paragraphs and list items are placed on separate lines so the
    markdown structure is preserved when rendered.
    """
    lines = [
        f"# {title}",
        "",
        f"By {author} | {published.strftime('%B %d, %Y')}",
        "",
        "## Introduction",
        "",
        f"This is a comprehensive article about {title.lower()}. "
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
        "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
        "",
        "## Key Points",
        "",
        "- Important point about the topic",
        "- Another crucial insight",
        "- Technical details and specifications",
        "- Performance comparisons",
        "",
        "## Conclusion",
        "",
        "In summary, this article explored various aspects of the topic. "
        "Stay tuned for more updates!",
        "",
    ]
    return "\n".join(lines)


def _lookup_app_ids(cursor, app_names):
    """Return the ids of the apps named in *app_names*, skipping unknown names."""
    app_ids = []
    for app_name in app_names:
        cursor.execute("SELECT id FROM apps WHERE name = ?", (app_name,))
        row = cursor.fetchone()
        if row:
            app_ids.append(row[0])
    return app_ids


def _insert_articles(cursor, now):
    """Insert ``ARTICLES_DATA``; apps must already be inserted for the id lookup."""
    for title, category, author, related_apps, tags, image in ARTICLES_DATA:
        related_ids = _lookup_app_ids(cursor, related_apps)
        content = _build_article_content(title, author, now)
        cursor.execute("""
            INSERT INTO articles (title, slug, content, author, category,
                                  related_apps, featured_image, tags, views)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (title, generate_slug(title), content, author, category,
              json.dumps(related_ids), image, json.dumps(tags),
              random.randint(200, 10000)))


def _insert_sponsors(cursor, now):
    """Insert ``SPONSORS_DATA`` as active sponsors with a random date window."""
    for company, tier, landing_url, banner in SPONSORS_DATA:
        start_date = now - timedelta(days=random.randint(1, 30))
        end_date = now + timedelta(days=random.randint(30, 180))
        logo_url = (
            f"https://ui-avatars.com/api/?name={quote_plus(company)}"
            f"&background=09b5a5&color=fff&size=200"
        )
        cursor.execute("""
            INSERT INTO sponsors (company_name, logo_url, tier, banner_url,
                                  landing_url, active, start_date, end_date)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, (company, logo_url, tier, banner, landing_url, 1,
              start_date.isoformat(), end_date.isoformat()))


def generate_dummy_data():
    """Wipe the marketplace tables and refill them with sample data.

    Clears ``apps``, ``articles``, ``categories`` and ``sponsors``, then
    inserts the records from the module-level constants and commits once at
    the end. If any statement fails, the pending changes are rolled back and
    the exception is re-raised, so a failed run does not leave half-seeded
    changes pending on the connection.

    Raises:
        Exception: Whatever the database layer raised, after the rollback.
    """
    db = DatabaseManager()
    # NOTE(review): db.conn is presumably a sqlite3 connection; confirm that
    # DatabaseManager exposes commit()/rollback() through it.
    conn = db.conn
    cursor = conn.cursor()

    # One timestamp for the whole run keeps article dates and sponsor
    # windows consistent with each other.
    now = datetime.now()

    try:
        _clear_tables(cursor)
        _insert_categories(cursor)
        # Apps go in before articles: articles resolve related app ids by name.
        _insert_apps(cursor)
        _insert_articles(cursor, now)
        _insert_sponsors(cursor, now)
        conn.commit()
    except Exception:
        conn.rollback()
        raise

    print("✓ Dummy data generated successfully!")
    print(f" - {len(CATEGORIES)} categories")
    print(f" - {len(APPS_DATA)} apps")
    print(f" - {len(ARTICLES_DATA)} articles")
    print(f" - {len(SPONSORS_DATA)} sponsors")


if __name__ == "__main__":
    generate_dummy_data()