"""Analyze the saved embed HTML to find engagement data patterns."""
import re

# Read the saved embed page in one go; later sections of this script all
# search the resulting `html` string.
with open('test_embed_http.html', encoding='utf-8', mode='r') as saved_page:
    html = saved_page.read()

# Report the size up front as a quick check that the file had content.
print(f"HTML length: {len(html)} chars")

# Scan the raw markup for digit runs that sit next to engagement wording.
print("\n=== Engagement patterns in raw HTML ===")
# Each entry is (label shown in the report, regex whose single capture group
# holds the number, which may contain ',' or '.' separators).
patterns = [
    ("num + likes/views/etc", r'(\d[\d,.]*)\s*(?:likes?|views?|comments?|plays?|shares?)'),
    ("liked by X others", r'liked\s+by\s+.*?(\d[\d,.]*)\s+others?'),
    ("X others like", r'(\d[\d,.]*)\s+others?\s+like'),
]
for name, regex in patterns:
    # Matching is case-insensitive; findall returns the captured numbers.
    matches = re.compile(regex, re.IGNORECASE).findall(html)
    if not matches:
        print(f"  [{name}]: not found")
        continue
    # Cap the report at the first ten hits to keep the output readable.
    print(f"  [{name}]: {matches[:10]}")

# Probe for object literals that the page attaches to `window.*` names.
print("\n=== Looking for embedded JSON data ===")
# Each entry is (label, regex). The capture groups are non-greedy, so they end
# at the first '}' that lets the rest of the pattern match; the result is
# meant as a preview, not as a complete JSON document.
json_patterns = [
    ("__additionalDataLoaded", r'window\.__additionalDataLoaded\s*\(\s*.*?,\s*(\{.+?\})\s*\)'),
    ("window._sharedData", r'window\._sharedData\s*=\s*(\{.+?\})\s*;'),
    ("window.__initialData", r'window\.__initialData\s*=\s*(\{.+?\})'),
]
for name, pat in json_patterns:
    # DOTALL lets '.' cross newlines, so multi-line blobs are still found.
    m = re.search(pat, html, re.DOTALL)
    if m is None:
        print(f"  NOT FOUND: {name}")
        continue
    print(f"  FOUND {name}! First 500 chars:")
    print(f"  {m.group(1)[:500]}")

# Inspect inline <script> bodies for field names that hint at engagement data.
print("\n=== Scripts containing engagement keywords ===")
# DOTALL so a script body spanning several lines is captured as one string.
scripts = re.findall(r'<script[^>]*>(.*?)</script>', html, re.DOTALL)
print(f"Total script tags: {len(scripts)}")
engagement_keywords = ('like_count', 'comment_count', 'view_count', 'edge_media', 'engagement')
for i, s in enumerate(scripts):
    # Compare against a lower-cased copy so the keyword test ignores case.
    lowered = s.lower()
    hits = [kw for kw in engagement_keywords if kw in lowered]
    if not hits:
        continue
    print(f"  Script #{i} has engagement data! Length: {len(s)}")
    # Only the first 400 characters are shown as a preview.
    print(f"  Preview: {s[:400]}")
    print("  ...")

# Surface <meta> tags whose double-quoted content attribute mentions an
# engagement-related word.
print("\n=== Meta tags with engagement info ===")
metas = re.findall(r'<meta[^>]*content="([^"]+)"[^>]*>', html)
meta_keywords = ('like', 'view', 'comment', 'follower')
for m in metas:
    lower_m = m.lower()
    for kw in meta_keywords:
        if kw in lower_m:
            # Print each matching tag once, truncated to 200 characters.
            print(f"  Meta: {m[:200]}")
            break

# Look for specific Instagram embed data structure
print("\n=== Instagram embed specific patterns ===")
# Each entry is (label, regex). Most regexes capture the interesting value in
# group 1; "shortcode_media" is a pure presence check and has no group.
embed_patterns = [
    ("gql_data", r'"gql_data"\s*:\s*(\{.+?),"extensions"'),
    ("shortcode_media", r'"shortcode_media"\s*:\s*\{'),
    ("edge_liked_by", r'"edge_liked_by"\s*:\s*\{"count"\s*:\s*(\d+)'),
    ("edge_media_to_comment", r'"edge_media_to_comment"\s*:\s*\{"count"\s*:\s*(\d+)'),
    ("is_video", r'"is_video"\s*:\s*(true|false)'),
    ("display_url", r'"display_url"\s*:\s*"([^"]{20,80})'),
    ("owner_username", r'"owner"\s*:\s*\{[^}]*"username"\s*:\s*"([^"]+)"'),
    ("full_name", r'"full_name"\s*:\s*"([^"]+)"'),
]
for name, pat in embed_patterns:
    # DOTALL lets '.' cross newlines inside the JSON-like payload.
    m = re.search(pat, html, re.DOTALL)
    if m:
        # Calling group(1) on a pattern without capture groups raises
        # IndexError, so fall back to the whole match in that case.
        found = m.group(1) if m.re.groups else m.group(0)
        # Show at most 100 characters of the value.
        print(f"  FOUND {name}: {found[:100]}")
    else:
        print(f"  NOT FOUND: {name}")

# Last resort: look at visible text content
print("\n=== Unique text snippets containing numbers ===")
# Strip HTML tags to get text. Only the tags themselves are removed, so text
# inside <script>/<style> elements is still part of the result.
text = re.sub(r'<[^>]+>', ' ', html)
# Collapse whitespace runs so "123   likes" and "123\nlikes" look the same.
text = re.sub(r'\s+', ' ', text)
# Find all "X likes" / "X views" style patterns
count_phrase = r'(\d[\d,]*)\s+(likes?|views?|comments?|plays?|shares?)'
found = (hit.group(0) for hit in re.finditer(count_phrase, text, re.IGNORECASE))
# The header promises unique snippets: dict.fromkeys drops repeats while
# keeping first-seen order, so each distinct snippet is printed once.
for snippet in dict.fromkeys(found):
    print(f"  {snippet}")

print("\nDone!")
