#!/usr/bin/env python3
"""
Instagram Reels Scraper
=======================

This script scrapes view statistics from the latest Reels of a public
Instagram profile.  It relies exclusively on browser automation using
Selenium and does **not** call any official or unofficial APIs.  The
script scrolls through the account's Reels page, reads the view count
of each reel tile directly from the grid, and derives the unique
slug/ID from the reel URL, then writes the collected data to a JSON or
CSV file (the publication-date field is currently left empty).  A
running total of views across the inspected reels is also reported.

Usage (from the command line)::

    python scrape_instagram_reels.py --username sinkmate.de --reels 20 --output output.json --profile-id 687e41cb8cfee375657ad010 --token your_token

Options:
    --username     Instagram username to scrape (required)
    --reels        Number of latest reels to process (default: 20)
    --output       Output filename, .json or .csv (required)
    --profile-id   GoLogin profile ID (optional; falls back to a hardcoded default)
    --token        GoLogin API token (optional; falls back to a hardcoded default)
    --delay        Delay in seconds between actions to mimic human behaviour (optional)

Requirements:
    - Python 3.8+
    - Selenium >= 4.0
    - GoLogin Python SDK
    - webdriver-manager
    - pyvirtualdisplay (requires Xvfb or another X server backend)

Notes:
    * Uses GoLogin profiles for browser fingerprint management
    * No Instagram account is required by default; the script reads only
      publicly available data.  (The disabled login helper contains
      hardcoded credentials that must be replaced before enabling it.)
    * Web scraping can break if Instagram updates their HTML or CSS.
"""

from __future__ import annotations

import argparse
import contextlib
import csv
import json
import logging
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from gologin import GoLogin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import ActionChains

from webdriver_manager.chrome import ChromeDriverManager
from pyvirtualdisplay import Display


logging.basicConfig(
    level=logging.INFO,
    format="[%(levelname)s] %(message)s",
)


@dataclass
class ReelInfo:
    """Container for information about a single Instagram reel."""

    url: str
    views: int
    date: Optional[str]
    slug: str

    def to_dict(self) -> dict:
        return {
            "url": self.url,
            "views": self.views,
            "date": self.date,
            "id": self.slug,
        }
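

# Example: constructing a record and serialising it (illustrative values,
# not captured output):
#
#     info = ReelInfo(url="https://www.instagram.com/reel/ABC123/",
#                     views=12300, date=None, slug="ABC123")
#     info.to_dict()
#     # -> {'url': 'https://www.instagram.com/reel/ABC123/',
#     #     'views': 12300, 'date': None, 'id': 'ABC123'}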


def parse_view_count(text: str) -> int:
    """Convert a view count string (e.g. '12,3�Tsd. Aufrufe', '1.2M views') to an integer.

    The view counts on Instagram can be displayed in different locales and with
    abbreviations.  This function normalises the number into an integer.

    Parameters
    ----------
    text: str
        Raw text containing the view count and possibly a label ("Aufrufe", "views").

    Returns
    -------
    int
        Parsed integer view count.  Returns 0 if parsing fails.
    """
    # Remove narrow and non-breaking spaces that some locales insert.
    cleaned = text.replace('\u202f', ' ').replace('\xa0', ' ').strip()
    # Extract the numeric part and the unit (if any).
    match = re.search(r"([\d.,]+)\s*([KkMm]|Tsd\.?|Mio\.?|Mn)?", cleaned)
    if not match:
        return 0
    number_str, unit = match.groups()
    # Normalise locale-specific separators.  When both '.' and ',' occur, the
    # rightmost one is the decimal separator ('1,234.5' vs '1.234,5').  A lone
    # separator followed by exactly three digits is treated as a thousands
    # separator; otherwise it is taken as the decimal separator.
    if '.' in number_str and ',' in number_str:
        if number_str.rfind(',') > number_str.rfind('.'):
            number_str = number_str.replace('.', '').replace(',', '.')
        else:
            number_str = number_str.replace(',', '')
    elif ',' in number_str or '.' in number_str:
        sep = ',' if ',' in number_str else '.'
        head, _, tail = number_str.rpartition(sep)
        head = head.replace(sep, '')
        number_str = head + tail if len(tail) == 3 else f"{head}.{tail}"
    try:
        value = float(number_str)
    except ValueError:
        return 0
    multiplier = 1
    if unit:
        unit = unit.lower().rstrip('.')
        if unit in ('k', 'tsd'):
            multiplier = 1_000
        elif unit in ('m', 'mn', 'mio'):
            multiplier = 1_000_000
    return int(value * multiplier)
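

# A few illustrative conversions for parse_view_count (the label strings are
# assumptions about Instagram's locale formatting, not captured output):
#
#     parse_view_count("12,3 Tsd. Aufrufe")   # -> 12300
#     parse_view_count("1.2M views")          # -> 1200000
#     parse_view_count("1,234 views")         # -> 1234
#     parse_view_count("Aufrufe")             # -> 0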


def handle_cookies_and_login_prompt(driver: webdriver.Remote, delay: float = 0.0) -> None:
    """Attempt to close cookie banner and login pop-up if present.

    Parameters
    ----------
    driver: webdriver.Remote
        An instance of the Selenium WebDriver.
    delay: float
        Optional sleep delay to slow down actions.
    """
    # Handle cookie banner (various languages).  We'll search for common patterns.
    cookie_selectors = [
        (By.XPATH, "//button[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ','abcdefghijklmnopqrstuvwxyzäöü'), 'alle erlauben')]"),
        (By.XPATH, "//button[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ','abcdefghijklmnopqrstuvwxyzäöü'), 'zustimmen')]"),
        (By.XPATH, "//button[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ','abcdefghijklmnopqrstuvwxyzäöü'), 'akzeptieren')]"),
        (By.XPATH, "//button[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜ','abcdefghijklmnopqrstuvwxyzäöü'), 'optionale cookies ablehnen')]"),
        (By.XPATH, "//button[contains(text(), 'Allow all cookies')]"),
    ]
    for by, selector in cookie_selectors:
        try:
            button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((by, selector)))
            button.click()
            if delay:
                time.sleep(delay)
            break
        except (TimeoutException, NoSuchElementException):
            continue

    # Handle the login pop-up: try to log in if a form is shown, otherwise
    # dismiss any overlay.
    try:
        # NOTE: these credentials are hardcoded; replace them with your own
        # before enabling this helper.
        username_field = driver.find_element(By.XPATH, '//*[@id="loginForm"]/div[1]/div[1]/div/label/input')
        password_field = driver.find_element(By.XPATH, '//*[@id="loginForm"]/div[1]/div[2]/div/label/input')
        login_btn = driver.find_element(By.XPATH, '//*[@id="loginForm"]/div[1]/div[3]/button')

        username_field.send_keys("sinkmate.dev")
        password_field.send_keys("B9!n4g64")
        driver.execute_script("arguments[0].click();", login_btn)

        if delay:
            time.sleep(delay)
    except (TimeoutException, NoSuchElementException):
        # No login form found; press Escape to close any overlay instead.
        with contextlib.suppress(Exception):
            driver.find_element(By.TAG_NAME, "body").send_keys(Keys.ESCAPE)
            if delay:
                time.sleep(delay)


def scroll_to_load_reels(driver: webdriver.Remote, reels_count: int, delay: float = 0.0) -> List[ReelInfo]:
    """Scroll the Reels grid and collect reel URLs with their view counts from the tiles."""
    collected: List[ReelInfo] = []
    seen_urls = set()
    attempts = 0
    max_attempts = 30

    while len(collected) < reels_count and attempts < max_attempts:
        reels = driver.find_elements(By.XPATH, "//a[contains(@href, '/reel/')]")
        for a in reels:
            url = a.get_attribute("href")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            
            try:
                # Target the span that carries the view count; it sits inside
                # a div nested within the reel tile's <a> tag.  The class
                # names are obfuscated and may change without notice.
                views_xpath = (
                    ".//div/div/div/span/span[contains(@class, 'x1vvkbs') "
                    "and contains(@class, 'xdj266r') and contains(@class, 'x14z9mp')]"
                )
                span = a.find_element(By.XPATH, views_xpath)
                views_text = span.text
                views = parse_view_count(views_text)
                if views == 0:
                    logging.warning(f"Could not parse the views element for reel {url}. Text: '{views_text}'")
                    driver.save_screenshot(f"error_screenshot_{attempts}.png")
                    # Retry once after a short pause in case the tile was still loading.
                    time.sleep(1)
                    span = a.find_element(By.XPATH, views_xpath)
                    views_text = span.text
                    views = parse_view_count(views_text)
                    logging.warning(f"Retry result: '{views_text}'")
            except NoSuchElementException:
                logging.warning(f"Views element not found for reel {url}. It has probably not loaded yet.")
                views = 0
            
            slug = url.rstrip('/').split('/')[-1]
            collected.append(ReelInfo(url=url, views=views, date=None, slug=slug))

            if len(collected) >= reels_count:
                break

        # Scroll down to trigger lazy loading of more reel tiles.  Honour the
        # user-supplied delay, but never sleep less than the original 3 s.
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        time.sleep(max(3.0, delay))
        attempts += 1

    return collected[:reels_count]
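

# The grid view does not expose a reel's publication date, which is why
# ReelInfo.date stays None.  Below is a minimal sketch of how the date could
# be fetched by opening each reel page, assuming Instagram still renders a
# <time datetime="..."> element there (an assumption about the current
# markup that may break without notice).
def fetch_reel_date(driver: webdriver.Remote, url: str) -> Optional[str]:
    """Best-effort fetch of a reel's ISO publication date, or None on failure."""
    try:
        driver.get(url)
        time_el = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "time"))
        )
        return time_el.get_attribute("datetime")
    except (TimeoutException, NoSuchElementException):
        return None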


def save_results(results: List[ReelInfo], total_views: int, output_path: Path) -> None:
    """Save the list of reel information to JSON or CSV.

    Parameters
    ----------
    results: List[ReelInfo]
        List of extracted reel information.
    total_views: int
        Sum of all views across the processed reels.
    output_path: Path
        Destination file.  The extension determines the format.
    """
    if output_path.suffix.lower() == ".json":
        data = [r.to_dict() for r in results]
        payload = {
            "reels": data,
            "total_views": total_views,
        }
        with output_path.open("w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
    elif output_path.suffix.lower() == ".csv":
        with output_path.open("w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["url", "views", "date", "id"])
            writer.writeheader()
            for r in results:
                writer.writerow(r.to_dict())
            # Append the total views as a final summary row.
            writer.writerow({"url": "TOTAL", "views": total_views, "date": "", "id": ""})
    else:
        raise ValueError(f"Unsupported output format: {output_path.suffix}")
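

# For reference, the JSON payload written by save_results has this shape
# (values are illustrative, not captured output):
#
#     {
#       "reels": [
#         {"url": "https://www.instagram.com/reel/<slug>/", "views": 12300,
#          "date": null, "id": "<slug>"}
#       ],
#       "total_views": 12300
#     }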


def run_scraper(username: str, count: int, output: str, delay: float = 0.0, profile_id: Optional[str] = None, token: Optional[str] = None) -> None:
    """Main entry point to run the reel scraper with GoLogin.

    Parameters
    ----------
    username: str
        Instagram account username (without '@').
    count: int
        Number of latest reels to process.
    output: str
        Path to output file (.json or .csv).
    delay: float
        Optional delay between actions to avoid detection.
    profile_id: Optional[str]
        GoLogin profile ID (optional; a hardcoded default is used if omitted).
    token: Optional[str]
        GoLogin API token (optional; a hardcoded default is used if omitted).
    """
    # Fall back to hardcoded defaults if not provided.  Replace these with
    # your own GoLogin profile ID and API token.
    PROFILE_ID = profile_id or "685054d57cb66888bd4a4e6f"
    TOKEN = token or (
        "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2ODQzMzQ5MjIwMDMwYjQ0OTdlZmVmN2YiLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2ODQ0NDA0MDYyNjQ0YWU1NzQ5ODI2MWMifQ.vj-RJv3gm0KpGnb0fAtmipDS2pDnLAFlDik8QKpsc8A"
    )
    
    # Initialize GoLogin with the browser flags needed on headless servers.
    gl = GoLogin({
        "token": TOKEN,
        "profile_id": PROFILE_ID,
        "extra_params": [
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--enable-unsafe-swiftshader",
            "--disable-gpu",
            "--disable-audio-output",
        ],
    })
    # Run the browser inside a virtual display so the script also works on
    # headless servers.
    display = Display(visible=0, size=(1920, 1080))
    display.start()
    driver = None
    try:
        # Start the GoLogin profile and install a matching chromedriver
        debugger_address = gl.start()
        chromium_version = gl.get_chromium_version()
        service = Service(ChromeDriverManager(driver_version=chromium_version).install())

        # Configure Chrome options for attaching to the GoLogin browser
        options = webdriver.ChromeOptions()
        options.add_experimental_option("debuggerAddress", debugger_address)
        options.add_argument("--lang=en-US,en;q=0.9")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--disable-notifications")

        # Create the driver
        driver = webdriver.Chrome(service=service, options=options)
        driver.set_window_size(1280, 900)

        # Give the browser a moment to finish starting up
        time.sleep(2)

        start_url = f"https://www.instagram.com/{username}/reels/"
        logging.info(f"Opening page: {start_url}")
        driver.get(start_url)

        # Nudge the mouse once so the page registers some user activity
        ActionChains(driver).move_by_offset(5, 5).perform()

        # handle_cookies_and_login_prompt(driver, delay)

        # Wait until reel anchors load
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//a[contains(@href, '/reel/')]"))
            )
        except TimeoutException:
            logging.error("No reels found on the page. Check if the username is correct or if Instagram requires login.")
            return

        # Scroll and collect reel links together with their view counts
        logging.info(f"Collecting up to {count} reel links...")
        results = scroll_to_load_reels(driver, count, delay)
        total_views = sum(r.views for r in results)

        logging.info(f"Finished scraping {len(results)} reels. Total views: {total_views}")

        # Save results to file
        output_path = Path(output)
        save_results(results, total_views, output_path)
        logging.info(f"Results saved to {output_path.resolve()}")
    except Exception as e:
        logging.error(f"Error during scraping: {e}")
        raise
    finally:
        # Always release the browser, the GoLogin profile, and the virtual
        # display, even if scraping failed part-way through.
        if driver is not None:
            with contextlib.suppress(Exception):
                driver.quit()
        with contextlib.suppress(Exception):
            gl.stop()
        display.stop()

def main(argv: List[str] | None = None) -> None:
    parser = argparse.ArgumentParser(description="Scrape the latest Instagram reels of a public profile using GoLogin profiles.")
    parser.add_argument("--username", required=True, help="Instagram username (without @)")
    parser.add_argument("--reels", type=int, default=20, help="Number of recent reels to scrape (default: 20)")
    parser.add_argument("--output", required=True, help="Output file path (.json or .csv)")
    parser.add_argument("--profile-id", help="GoLogin profile ID (optional, uses hardcoded value if not provided)")
    parser.add_argument("--token", help="GoLogin API token (optional, uses hardcoded value if not provided)")
    parser.add_argument("--delay", type=float, default=0.0, help="Optional delay (seconds) between actions to mimic a human user")
    args = parser.parse_args(argv)
    
    if args.reels <= 0:
        parser.error("--reels must be a positive integer")
    
    run_scraper(args.username, args.reels, args.output, args.delay, args.profile_id, args.token)


if __name__ == "__main__":
    main()