"""
CampaignHub AI — Realistic Marketing Dataset Generator
Generates 10,000 creators and 15,000 campaign-creator pairings
with realistic engagement metrics based on industry benchmarks.
"""
import pandas as pd
import numpy as np
import random
import os
import json

# ═══════════════════════════════════════════════════════════
# Generator constants
# ═══════════════════════════════════════════════════════════
_CATEGORIES = [
    'fashion', 'tech', 'food', 'fitness', 'beauty',
    'travel', 'gaming', 'lifestyle', 'music', 'education'
]
_PLATFORMS = ['Instagram', 'TikTok', 'YouTube', 'Twitter', 'Twitch']
_LOCATIONS = [
    'New York', 'Los Angeles', 'London', 'Dubai', 'Mumbai',
    'Tokyo', 'Berlin', 'Paris', 'Toronto', 'Sydney',
    'Lagos', 'São Paulo', 'Singapore', 'Seoul', 'Amsterdam',
    'Addis Ababa', 'Nairobi', 'Cape Town', 'Cairo', 'Accra'
]
_TONES = ['casual', 'professional', 'edgy', 'inspirational', 'humorous']

# Engagement-rate ranges by platform as (low, high) FRACTIONS, i.e. 0.02 == 2%.
_ENG_BENCHMARKS = {
    'Instagram': (0.02, 0.08),
    'TikTok': (0.04, 0.15),
    'YouTube': (0.015, 0.06),
    'Twitter': (0.01, 0.04),
    'Twitch': (0.03, 0.10),
}

# Follower counts are clamped to this range; the audience-size damping in
# _generate_creators is anchored on log10(_MIN_FOLLOWERS) == 3.
_MIN_FOLLOWERS = 1000
_MAX_FOLLOWERS = 10_000_000

# Niche-specific bio templates. '{followers}' is substituted with the
# creator's thousands-separated follower count.
_BIO_TEMPLATES = {
    'fashion': [
        "Fashion content creator sharing style tips and outfit inspiration. Collaborating with brands that value aesthetics and authenticity.",
        "Sustainable fashion advocate. Curating looks that are chic, conscious, and camera-ready.",
        "Streetwear enthusiast creating lookbooks and brand stories. DM for collabs.",
        "Fashion blogger | {followers}+ community | Styling tips, hauls, and brand partnerships.",
        "Luxury fashion influencer covering runway trends, designer reviews, and editorial shoots."
    ],
    'tech': [
        "Tech reviewer and gadget enthusiast. Honest reviews, unboxings, and tutorials for the latest devices.",
        "Software engineer by day, tech content creator by night. Covering AI, apps, and innovation.",
        "Building the future one review at a time. Specializing in smartphones, laptops, and smart home tech.",
        "Tech educator creating tutorials, comparisons, and buying guides for everyday consumers.",
        "Developer and tech influencer focused on startups, SaaS, and emerging technology trends."
    ],
    'food': [
        "Food content creator and recipe developer. Partnering with brands that taste as good as they look.",
        "Home chef sharing easy recipes, restaurant reviews, and cooking tips. Let's eat!",
        "Food photographer and blogger. Turning meals into visual stories for brands worldwide.",
        "Plant-based food creator. Vegan recipes, nutrition tips, and sustainable eating habits.",
        "Culinary explorer traveling the world one dish at a time. Food brand collaborations welcome."
    ],
    'fitness': [
        "Certified personal trainer creating workout content, supplement reviews, and transformation stories.",
        "Fitness influencer and wellness advocate. Helping people build healthy habits through content.",
        "CrossFit athlete and content creator. Sharing training routines, nutrition plans, and brand partnerships.",
        "Yoga instructor and mindfulness coach. Creating calm, centered content for health brands.",
        "Bodybuilding competitor and fitness coach. Training videos, meal prep, and supplement partnerships."
    ],
    'beauty': [
        "Beauty content creator specializing in makeup tutorials, skincare routines, and product reviews.",
        "Licensed esthetician sharing professional skincare advice and honest product reviews.",
        "Makeup artist and beauty influencer. Creating looks, tutorials, and brand-sponsored content.",
        "Clean beauty advocate reviewing cruelty-free and sustainable beauty products.",
        "Beauty blogger with a passion for inclusivity. All skin tones, all shades, all welcome."
    ],
    'travel': [
        "Travel content creator exploring hidden gems and luxury destinations worldwide.",
        "Adventure traveler and photographer. Creating cinematic travel stories for tourism brands.",
        "Budget travel expert sharing tips, itineraries, and affordable destination guides.",
        "Digital nomad creating content from 30+ countries. Travel brand collaborations welcome.",
        "Luxury travel influencer reviewing hotels, airlines, and premium travel experiences."
    ],
    'gaming': [
        "Full-time streamer and gaming content creator. Partnering with gaming brands and peripherals.",
        "Competitive esports player creating gameplay content, reviews, and tutorials.",
        "Indie game reviewer and content creator. Supporting small studios and honest gaming journalism.",
        "Gaming setup enthusiast reviewing chairs, monitors, keyboards, and peripherals.",
        "Retro gaming collector and content creator. Nostalgia-driven content for gaming brands."
    ],
    'lifestyle': [
        "Lifestyle content creator sharing daily routines, home organization, and personal development tips.",
        "Minimalist lifestyle advocate. Creating content about intentional living and mindful consumption.",
        "Mom blogger sharing parenting tips, family activities, and lifestyle brand partnerships.",
        "Wellness and lifestyle influencer covering self-care, productivity, and personal growth.",
        "Urban lifestyle creator focused on city living, apartment tours, and everyday essentials."
    ],
    'music': [
        "Independent musician and content creator. Covering new releases, gear reviews, and tutorials.",
        "Music producer sharing behind-the-scenes content, production tips, and audio gear reviews.",
        "Singer-songwriter creating original content and collaborating with music and audio brands.",
        "DJ and electronic music creator. Event coverage, gear reviews, and music brand partnerships.",
        "Music educator and content creator teaching theory, instruments, and production techniques."
    ],
    'education': [
        "Online educator creating courses, tutorials, and study tips for students worldwide.",
        "STEM content creator making science and math accessible and engaging for all ages.",
        "Language learning content creator. Teaching English, Spanish, and French through social media.",
        "Study tips and productivity content creator. Helping students excel with evidence-based methods.",
        "EdTech reviewer and educator. Covering apps, tools, and platforms that transform learning."
    ]
}


def _generate_creators(num_creators):
    """Build the creator table: one row of synthetic profile data per creator.

    Draws from the global ``np.random`` and ``random`` generators, so the
    caller is responsible for seeding them beforehand.
    """
    # Heavy-tailed (log-normal) follower counts, clamped to a fixed range.
    followers = np.random.lognormal(mean=10, sigma=1.5, size=num_creators).astype(int)
    followers = np.clip(followers, _MIN_FOLLOWERS, _MAX_FOLLOWERS)

    categories = np.random.choice(_CATEGORIES, num_creators)
    platforms = np.random.choice(_PLATFORMS, num_creators)
    locations = np.random.choice(_LOCATIONS, num_creators)

    # One base rate per creator, drawn inside that creator's platform range.
    bounds = np.array([_ENG_BENCHMARKS[p] for p in platforms])
    base_eng = np.random.uniform(bounds[:, 0], bounds[:, 1])

    # Engagement rate falls as audience size grows. The effect is applied as
    # a multiplicative damping factor: 1.0 at 1,000 followers, shrinking
    # linearly in log10(followers) to 0.4 at 10,000,000.
    # A flat subtraction of log10(followers) / 20 does not work here: it is
    # never below 0.15, which is the highest benchmark ceiling, so every rate
    # would collapse onto the 0.005 floor.
    size_factor = 1.0 - 0.15 * (np.log10(followers) - 3.0)
    engagement_rates = np.clip(
        base_eng * size_factor + np.random.normal(0, 0.005, num_creators),
        0.005, 0.20
    )

    # Past campaign success: grows with the number of past campaigns.
    experience = np.random.randint(0, 80, num_creators)
    success_rate = np.clip(
        0.5 + experience * 0.005 + np.random.normal(0, 0.1, num_creators),
        0.1, 0.99
    )

    # Content quality score, clamped to 2.0-10.0.
    content_quality = np.clip(np.random.normal(7.0, 1.5, num_creators), 2.0, 10.0).round(1)

    # Response time in hours, clamped to 0.5-72.
    response_time = np.clip(np.random.exponential(12, num_creators), 0.5, 72.0).round(1)

    # Brand safety score, clamped to 0.4-1.0.
    brand_safety = np.clip(np.random.normal(0.85, 0.08, num_creators), 0.4, 1.0).round(3)

    # One bio per creator, picked from the templates of the creator's niche.
    bios = [
        random.choice(_BIO_TEMPLATES[cat]).replace('{followers}', f'{count:,}')
        for cat, count in zip(categories, followers)
    ]

    # Interest weights per campaign category: 0.6 for the creator's own
    # category, 0.3 for two randomly chosen secondary ones, 0.1 elsewhere.
    interest_categories = []
    for cat in categories:
        prefs = {c: 0.1 for c in _CATEGORIES}
        prefs[cat] = 0.6
        for secondary in random.sample([c for c in _CATEGORIES if c != cat], 2):
            prefs[secondary] = 0.3
        interest_categories.append(prefs)

    return pd.DataFrame({
        'creator_id': range(1, num_creators + 1),
        'category': categories,
        'platform': platforms,
        'location': locations,
        'followers': followers,
        'engagement_rate': engagement_rates.round(4),
        'past_campaigns': experience,
        'success_rate': success_rate.round(3),
        'content_quality': content_quality,
        'response_time_hrs': response_time,
        'brand_safety': brand_safety,
        'bio': bios,
        'interests': [json.dumps(ic) for ic in interest_categories]
    })


def _generate_pairings(creators_df, num_campaigns):
    """Build the campaign table: each campaign paired with one sampled creator.

    Creators are sampled with replacement from ``creators_df``; outcome
    columns (reach, engagement, ROI, qualification) are derived from the
    campaign and creator attributes plus random noise.
    """
    campaign_budgets = np.random.lognormal(mean=8, sigma=1.0, size=num_campaigns).astype(int)
    campaign_budgets = np.clip(campaign_budgets, 500, 100000)
    campaign_categories = np.random.choice(_CATEGORIES, num_campaigns)
    campaign_platforms = np.random.choice(_PLATFORMS, num_campaigns)
    campaign_tones = np.random.choice(_TONES, num_campaigns)

    # Sample a creator for each campaign (the same creator may recur).
    assigned = creators_df.sample(n=num_campaigns, replace=True)
    creator_followers = assigned['followers'].values

    # 1.0 where campaign and creator agree on category / platform, else 0.0.
    cat_match = (campaign_categories == assigned['category'].values).astype(float)
    plat_match = (campaign_platforms == assigned['platform'].values).astype(float)

    # Reach: f(followers, budget, category_match, platform_match)
    reach_base = creator_followers * np.random.uniform(0.3, 0.7, num_campaigns)
    budget_boost = np.log1p(campaign_budgets) * 500
    cat_boost = cat_match * creator_followers * 0.15
    plat_boost = plat_match * creator_followers * 0.10
    noise = np.random.normal(1.0, 0.08, num_campaigns)
    actual_reach = ((reach_base + budget_boost + cat_boost + plat_boost) * noise).astype(int)
    actual_reach = np.clip(actual_reach, 500, None)

    # Engagement: f(reach, engagement_rate, category_match, content_quality)
    eng_base = actual_reach * assigned['engagement_rate'].values
    quality_mult = assigned['content_quality'].values / 7.0
    eng_noise = np.random.normal(1.0, 0.1, num_campaigns)
    actual_engagement = (eng_base * quality_mult * (1 + cat_match * 0.2) * eng_noise).astype(int)
    actual_engagement = np.clip(actual_engagement, 10, None)

    # ROI score: engagements per dollar, scaled by 100 and clamped to 1-99.
    roi_raw = (actual_engagement / np.maximum(campaign_budgets, 1)) * 100
    roi_score = np.clip(roi_raw, 1, 99).round(1)

    # Qualification score: weighted sum of match flags and creator quality.
    qualification_score = (
        cat_match * 25 +
        plat_match * 15 +
        assigned['success_rate'].values * 20 +
        assigned['content_quality'].values * 3 +
        assigned['brand_safety'].values * 15 +
        # Response-time term: capped at 10 points for anything up to ~33 h,
        # tapering linearly to 0 points at ~67 h.
        np.clip(20 - assigned['response_time_hrs'].values * 0.3, 0, 10)
    )
    qualification_score = np.clip(
        qualification_score + np.random.normal(0, 3, num_campaigns), 10, 99
    ).round(1)

    # Accepted when the (already noisy) qualification score exceeds 55.
    was_accepted = (qualification_score > 55).astype(int)

    return pd.DataFrame({
        'campaign_id': range(1, num_campaigns + 1),
        'campaign_category': campaign_categories,
        'campaign_platform': campaign_platforms,
        'campaign_tone': campaign_tones,
        'budget': campaign_budgets,
        'creator_id': assigned['creator_id'].values,
        'creator_category': assigned['category'].values,
        'creator_platform': assigned['platform'].values,
        'creator_followers': creator_followers,
        'creator_engagement_rate': assigned['engagement_rate'].values,
        'creator_content_quality': assigned['content_quality'].values,
        'creator_success_rate': assigned['success_rate'].values,
        'creator_response_time': assigned['response_time_hrs'].values,
        'creator_brand_safety': assigned['brand_safety'].values,
        'category_match': cat_match.astype(int),
        'platform_match': plat_match.astype(int),
        'actual_reach': actual_reach,
        'actual_engagement': actual_engagement,
        'roi_score': roi_score,
        'qualification_score': qualification_score,
        'was_accepted': was_accepted
    })


def generate_dataset(num_creators: int = 10000, num_campaigns: int = 15000,
                     output_dir: str = 'data', seed=42) -> None:
    """Generate the synthetic creator and campaign tables and save them as CSV.

    Writes two files into ``output_dir`` (created if missing):
    ``creator_corpus.csv`` with one row per creator, and
    ``campaign_performance.csv`` with one row per campaign-creator pairing.

    Args:
        num_creators: Number of creators to generate; must be at least 1.
        num_campaigns: Number of campaign-creator pairings; must not be
            negative.
        output_dir: Directory the CSV files are written to.
        seed: Seed applied to both ``np.random`` and ``random`` so runs are
            reproducible. ``None`` lets both generators seed themselves.

    Raises:
        ValueError: If ``num_creators`` < 1 or ``num_campaigns`` < 0.
    """
    if num_creators < 1:
        raise ValueError("num_creators must be at least 1")
    if num_campaigns < 0:
        raise ValueError("num_campaigns must not be negative")

    print("Generating Realistic Influencer & Campaign Dataset...")
    np.random.seed(seed)
    random.seed(seed)

    creators_df = _generate_creators(num_creators)
    dataset = _generate_pairings(creators_df, num_campaigns)

    # ═══════════════════════════════════════════════════════════
    # Save
    # ═══════════════════════════════════════════════════════════
    os.makedirs(output_dir, exist_ok=True)
    dataset.to_csv(os.path.join(output_dir, 'campaign_performance.csv'), index=False)
    creators_df.to_csv(os.path.join(output_dir, 'creator_corpus.csv'), index=False)

    print(f"Dataset generated: {len(creators_df)} creators, {len(dataset)} campaign pairings")
    print(f"Saved to {output_dir}/ directory.")

# Script entry point: generate and save the dataset only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    generate_dataset()
