﻿"""
CampaignHub AI — Model Training Pipeline
Trains 7 specialized ML models for campaign management.
"""
import pandas as pd
import numpy as np
import os
import json
import joblib
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score

def _build_interest_matrix(creators_df, categories, default=0.1):
    """Build the creator-by-category interest matrix for the recommender.

    Each row corresponds to one creator (in the positional order of
    ``creators_df``) and each column to one entry of ``categories``.

    Args:
        creators_df: DataFrame of creators. Its ``interests`` column, when
            present, is expected to hold a JSON object per row mapping
            category name to a numeric weight.
        categories: Ordered category names; defines the column order.
        default: Weight used for categories missing from a creator's JSON,
            and for every category of a row whose JSON cannot be used.

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(creators_df), len(categories))``.
    """
    # Start from the fallback weight so unusable rows need no further work.
    matrix = np.full((len(creators_df), len(categories)), default, dtype=float)
    if 'interests' not in creators_df.columns:
        return matrix

    # Enumerate positionally: DataFrame index labels are not guaranteed to be
    # 0..n-1, so they must not be used to address rows of the NumPy matrix.
    for pos, raw in enumerate(creators_df['interests']):
        try:
            prefs = json.loads(raw)
            matrix[pos] = [float(prefs.get(cat, default)) for cat in categories]
        except (TypeError, ValueError, AttributeError, OverflowError):
            # TypeError: missing value (NaN) or non-numeric weight;
            # ValueError: invalid JSON or unparsable weight;
            # AttributeError: JSON value is not an object;
            # OverflowError: weight too large for a float.
            # Best-effort: the row keeps the default weights.
            continue
    return matrix


def train(data_dir='data', models_dir='models'):
    """Train the seven CampaignHub models and persist them to disk.

    Reads ``campaign_performance.csv`` and ``creator_corpus.csv`` from
    ``data_dir``, fits each model, prints progress and hold-out metrics, and
    writes the fitted artifacts into ``models_dir`` (created if missing).

    Args:
        data_dir: Directory containing the two input CSV files.
        models_dir: Directory the model artifacts are written to.

    Returns:
        None. All results are written to ``models_dir`` and stdout.

    Raises:
        FileNotFoundError: If an input CSV does not exist.
        KeyError: If a required column is absent from an input CSV.
    """
    def data_path(name):
        return os.path.join(data_dir, name)

    def model_path(name):
        return os.path.join(models_dir, name)

    print("=" * 60)
    print("CampaignHub AI — Training 7 Models")
    print("=" * 60)

    os.makedirs(models_dir, exist_ok=True)

    # Load datasets
    df = pd.read_csv(data_path('campaign_performance.csv'))
    creators_df = pd.read_csv(data_path('creator_corpus.csv'))

    # ------------------------------------------------------------------
    # Model 1: Reach Predictor (RandomForest)
    # ------------------------------------------------------------------
    print("\n[1/7] Training Reach Predictor (RandomForest, 200 trees)...")

    features_reach = ['budget', 'creator_followers', 'creator_engagement_rate',
                      'creator_content_quality', 'category_match', 'platform_match']
    X_reach = df[features_reach]
    y_reach = df['actual_reach']

    X_tr, X_te, y_tr, y_te = train_test_split(X_reach, y_reach, test_size=0.2, random_state=42)
    reach_model = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
    reach_model.fit(X_tr, y_tr)
    mae = mean_absolute_error(y_te, reach_model.predict(X_te))
    print(f"   ✓ Reach MAE: {mae:,.0f}")
    joblib.dump(reach_model, model_path('reach_predictor.pkl'))

    # ------------------------------------------------------------------
    # Model 2: Engagement Predictor (GradientBoosting)
    # ------------------------------------------------------------------
    print("[2/7] Training Engagement Predictor (GradientBoosting)...")

    features_eng = ['budget', 'creator_followers', 'creator_engagement_rate',
                    'creator_content_quality', 'creator_success_rate',
                    'category_match', 'platform_match']
    X_eng = df[features_eng]
    y_eng = df['actual_engagement']

    X_tr, X_te, y_tr, y_te = train_test_split(X_eng, y_eng, test_size=0.2, random_state=42)
    eng_model = GradientBoostingRegressor(n_estimators=150, max_depth=8, learning_rate=0.1, random_state=42)
    eng_model.fit(X_tr, y_tr)
    mae = mean_absolute_error(y_te, eng_model.predict(X_te))
    print(f"   ✓ Engagement MAE: {mae:,.0f}")
    joblib.dump(eng_model, model_path('engagement_predictor.pkl'))

    # ------------------------------------------------------------------
    # Model 3: Applicant Ranker (GradientBoosting Classifier)
    # ------------------------------------------------------------------
    print("[3/7] Training Applicant Ranker (GradientBoosting Classifier)...")

    features_rank = ['creator_followers', 'creator_engagement_rate', 'creator_content_quality',
                     'creator_success_rate', 'creator_response_time', 'creator_brand_safety',
                     'category_match', 'platform_match']
    X_rank = df[features_rank]
    y_rank = df['was_accepted']

    X_tr, X_te, y_tr, y_te = train_test_split(X_rank, y_rank, test_size=0.2, random_state=42)
    rank_model = GradientBoostingClassifier(n_estimators=150, max_depth=6, learning_rate=0.1, random_state=42)
    rank_model.fit(X_tr, y_tr)
    acc = accuracy_score(y_te, rank_model.predict(X_te))
    print(f"   ✓ Applicant Ranker Accuracy: {acc:.1%}")
    joblib.dump(rank_model, model_path('applicant_ranker.pkl'))

    # ------------------------------------------------------------------
    # Model 4: Budget Optimizer (LinearRegression)
    # Fitted on the full dataset; no hold-out metric is reported.
    # ------------------------------------------------------------------
    print("[4/7] Training Budget Optimizer (LinearRegression)...")

    X_budget = df[['creator_followers', 'creator_engagement_rate', 'category_match', 'platform_match']]
    y_budget = df['roi_score']

    budget_model = LinearRegression()
    budget_model.fit(X_budget, y_budget)
    joblib.dump(budget_model, model_path('budget_optimizer.pkl'))
    print("   ✓ Budget Optimizer trained")

    # ------------------------------------------------------------------
    # Model 5: Smart Match Engine (TF-IDF + KNN)
    # ------------------------------------------------------------------
    print("[5/7] Training Smart Match Engine (TF-IDF + KNN)...")

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 2))
    # Missing bios become empty documents rather than NaN floats.
    tfidf_matrix = tfidf.fit_transform(creators_df['bio'].fillna(''))

    nn_model = NearestNeighbors(n_neighbors=15, metric='cosine', algorithm='brute')
    nn_model.fit(tfidf_matrix)

    joblib.dump(tfidf, model_path('tfidf_vectorizer.pkl'))
    joblib.dump(nn_model, model_path('nn_matcher.pkl'))
    # The creator table is stored next to the matcher, whose row order it shares.
    creators_df.to_pickle(model_path('creators_db.pkl'))
    print(f"   ✓ TF-IDF features: {tfidf_matrix.shape[1]}, KNN neighbors: 15")

    # ------------------------------------------------------------------
    # Model 6: Creator Recommender (SVD Collaborative Filtering)
    # ------------------------------------------------------------------
    print("[6/7] Training Creator Recommender (TruncatedSVD)...")

    # Build creator-category interaction matrix from interests
    categories = ['fashion', 'tech', 'food', 'fitness', 'beauty',
                  'travel', 'gaming', 'lifestyle', 'music', 'education']
    interest_matrix = _build_interest_matrix(creators_df, categories)

    # Dimensionality reduction
    svd = TruncatedSVD(n_components=min(8, len(categories) - 1), random_state=42)
    creator_embeddings = svd.fit_transform(interest_matrix)

    joblib.dump(svd, model_path('recommender_svd.pkl'))
    np.save(model_path('creator_embeddings.npy'), creator_embeddings)
    np.save(model_path('interest_matrix.npy'), interest_matrix)
    print(f"   ✓ SVD components: {svd.n_components}, Explained variance: {svd.explained_variance_ratio_.sum():.1%}")

    # ------------------------------------------------------------------
    # Model 7: Qualification Score Regressor
    # Reuses the applicant-ranker feature set; fitted on the full dataset.
    # ------------------------------------------------------------------
    print("[7/7] Training Qualification Score Regressor...")

    X_qual = df[features_rank]
    y_qual = df['qualification_score']

    qual_model = GradientBoostingRegressor(n_estimators=100, max_depth=6, random_state=42)
    qual_model.fit(X_qual, y_qual)
    joblib.dump(qual_model, model_path('qualification_scorer.pkl'))
    print("   ✓ Qualification scorer trained")

    print("\n" + "=" * 60)
    print(f"All 7 models saved to {models_dir}/ directory.")
    print("=" * 60)

# Run the full training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train()

