o
    ch+;                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ej+ej,dd eG dd dZ-d;ddZ.d<d=d#d$Z/d<d>d'd(Z0d?d-d.Z1d@dAd4d5Z2dBdCd8d9Z3e4d:kre3  dS dS )Da  
Instagram Reels Scraper
=======================

This script scrapes view statistics from the latest Reels of a public
Instagram profile.  It relies exclusively on browser automation using
Selenium and does **not** call any official or unofficial APIs.  The
script scrolls through the account's Reels page, opens each reel in a
new tab, extracts the number of views, the publication date, and the
unique slug/ID from the URL, then writes the collected data to a JSON
or CSV file.  A running total of views across the inspected reels is
also reported.

Usage (from the command line)::

    python scrape_instagram_reels.py --username sinkmate.de --reels 20 --output output.json --profile-id 687e41cb8cfee375657ad010 --token your_token

Options:
    --username     Instagram username to scrape (required)
    --reels        Number of latest reels to process (default: 20)
    --output       Output filename, .json or .csv (required)
    --profile-id   GoLogin profile ID (required)
    --token        GoLogin API token (required)
    --delay        Delay in seconds between actions to mimic human behaviour (optional)

Requirements:
    - Python 3.8+
    - Selenium >= 4.0
    - GoLogin Python SDK
    - webdriver-manager

Notes:
    * Uses GoLogin profiles for browser fingerprint management
    * No Instagram account is required to run this script; it uses only
      publicly available data.
    * Web scraping can break if Instagram updates their HTML or CSS.
    )annotationsN)	dataclass)datetime)Path)ListOptional)GoLogin)	webdriver)By)Keys)Service)WebDriverWait)expected_conditions)NoSuchElementExceptionTimeoutException)ActionChains)ChromeDriverManager)Displayz[%(levelname)s] %(message)s)levelformatc                   @  s<   e Zd ZU dZded< ded< ded< ded< dddZdS )ReelInfoz8Container for information about a single Instagram reel.strurlintviewszOptional[str]dateslugreturndictc                 C  s   | j | j| j| jdS )Nr   r   r   idr   r   r   r   )self r#   //var/www/html/scraper/scrape_instagram_reels.pyto_dictT   s
   zReelInfo.to_dictN)r   r   )__name__
__module____qualname____doc____annotations__r%   r#   r#   r#   r$   r   K   s   
 r   textr   r   r   c                 C  s   |  dd dd }td|}|sdS | \}}| dd dd}zt|}W n
 ty6   Y dS w d	}|rL| }|d
v rFd}n|dv rLd}t|| S )u  Convert a view count string (e.g. '12,3�Tsd. Aufrufe', '1.2M views') to an integer.

    The view counts on Instagram can be displayed in different locales and with
    abbreviations.  This function normalises the number into an integer.

    Parameters
    ----------
    text: str
        Raw text containing the view count and possibly a label ("Aufrufe", "views").

    Returns
    -------
    int
        Parsed integer view count.  Returns 0 if parsing fails.
    u         z([\d.,]+)\s*([KMk]|Tsd\.|M|Mn)?r   . ,   )ktsdi  )mmni@B )	replacestripresearchgroupsfloat
ValueErrorlowerr   )r+   cleanedmatch
number_strunitvalue
multiplierr#   r#   r$   parse_view_count]   s&   rD           driverwebdriver.Remotedelayr;   Nonec           	   
   C  sB  t jdft jdft jdft jdft jdfg}|D ]*\}}zt| dt||f}|  |r5t| W  n t	t
fyB   Y qw z2| t jd}| t jd}| t jd	}|d
 |d | d| |rst| W dS W dS  t	t
fy   z| t jdtj |rt| W Y dS W Y dS    Y Y dS w )zAttempt to close cookie banner and login pop-up if present.

    Parameters
    ----------
    driver: webdriver.Remote
        An instance of the Selenium WebDriver.
    delay: float
        Optional sleep delay to slow down actions.
    u   //button[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ���','abcdefghijklmnopqrstuvwxyz���'), 'allow') and contains(text(), 'Alle')]u~   //button[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ���','abcdefghijklmnopqrstuvwxyz���'), 'zustimmen')]u   //button[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ���','abcdefghijklmnopqrstuvwxyz���'), 'akzeptieren')]u   //button[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ���','abcdefghijklmnopqrstuvwxyz���'), 'optionale cookies ablehnen')]z///button[contains(text(), 'Allow all cookies')]   z2//*[@id="loginForm"]/div[1]/div[1]/div/label/inputz2//*[@id="loginForm"]/div[1]/div[2]/div/label/inputz)//*[@id="loginForm"]/div[1]/div[3]/buttonzsinkmate.devzB9!n4g64zarguments[0].click();bodyN)r
   XPATHr   untilECelement_to_be_clickableclicktimesleepr   r   find_element	send_keysexecute_scriptTAG_NAMEr   ESCAPE)	rF   rH   cookie_selectorsbyselectorbuttonusername_fieldpassword_field	login_btnr#   r#   r$   handle_cookies_and_login_prompt   sF   


r_   reels_countList[ReelInfo]c              	   C  s  g }t  }d}d}t||k r||k r| tjd}|D ]}|d}	|	r)|	|v r*q||	 zE|tjd}
|
j}t	|}|dkrst
d|	 d| d | d	| d
 td |tjd}
|
j}t	|}t
d| d W n ty   t
d|	 d d}Y nw |	ddd }|t|	|d|d t||kr nq| d td |d7 }t||k r||k s|d| S )zLScrolls and collects reels with URLs and view counts directly from the grid.r      //a[contains(@href, '/reel/')]hrefzu.//div/div/div/span/span[contains(@class, 'x1vvkbs') and contains(@class, 'xdj266r') and contains(@class, 'x14z9mp')]u   Views-Element f�r Reel z1 konnte nicht korrekt ausgewertet werden. Text: ''error_screenshot_z.pngr1   u   Nächster Versuch: 'u   Views-Element nicht f�r Reel z) gefunden. Vermutlich noch nicht geladen./Nr!   z'window.scrollBy(0, window.innerHeight);   )setlenfind_elementsr
   rL   get_attributeaddrS   r+   rD   loggingwarningsave_screenshotrQ   rR   r   rstripsplitappendr   rU   )rF   r`   rH   	collected	seen_urlsattemptsmax_attemptsreelsar   span
views_textr   r   r#   r#   r$   scroll_to_load_reels   sL   




'r}   resultstotal_viewsoutput_pathr   c           	      C  s  |j  dkr6dd | D }||d}|jddd}tj||dd	d
 W d   dS 1 s/w   Y  dS |j  dkr{|jdddd,}tj|g dd}|  | D ]	}||	  qU|d|ddd W d   dS 1 stw   Y  dS t
d|j  )a=  Save the list of reel information to JSON or CSV.

    Parameters
    ----------
    results: List[ReelInfo]
        List of extracted reel information.
    total_views: int
        Sum of all views across the processed reels.
    output_path: Path
        Destination file.  The extension determines the format.
    z.jsonc                 S  s   g | ]}|  qS r#   )r%   .0rr#   r#   r$   
<listcomp>   s    z save_results.<locals>.<listcomp>)ry   r   wzutf-8)encodingFrJ   )ensure_asciiindentNz.csvr/   )newliner   r   )
fieldnamesTOTALzUnsupported output format: )suffixr=   openjsondumpcsv
DictWriterwriteheaderwriterowr%   r<   )	r~   r   r   datapayloadfcsvfilewriterr   r#   r#   r$   save_results   s"   ""r   usernamecountoutput
profile_idtokenc              
   C  s  |pd}|pd}t ||g dd}tddd}	|	  z| }
| }tt|d }t }|	d	|
 |
d
 |
d |
d tj||d}|dd td d|  d}td|  || t|dd  zt|dttjdf W n ty   td |  |  |	  Y W dS w td| d t|||}t dd |D }tdt!| d|  t"|}t#||| td|$   |  |  W dS  t%y } ztd |  z|  W     Y  d}~ww )!a  Main entry point to run the reel scraper with GoLogin.

    Parameters
    ----------
    username: str
        Instagram account username (without '@').
    count: int
        Number of latest reels to process.
    output: str
        Path to output file (.json or .csv).
    delay: float
        Optional delay between actions to avoid detection.
    profile_id: str
        GoLogin profile ID (optional, uses hardcoded value if not provided).
    token: str
        GoLogin API token (optional, uses hardcoded value if not provided).
    685054d57cb66888bd4a4e6fzeyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2ODQzMzQ5MjIwMDMwYjQ0OTdlZmVmN2YiLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2ODQ0NDA0MDYyNjQ0YWU1NzQ5ODI2MWMifQ.vj-RJv3gm0KpGnb0fAtmipDS2pDnLAFlDik8QKpsc8A)z--no-sandboxz--disable-dev-shm-usagez--enable-unsafe-swiftshaderz--disable-gpuz--disable-audio-output)r   r   extra_paramsr   )i  i8  )visiblesize)driver_versiondebuggerAddressz--lang=en-US,en;q=0.9z---disable-blink-features=AutomationControlledz--disable-notifications)serviceoptionsi   i  rJ   zhttps://www.instagram.com/z/reels/zOpening page:       rc   z\No reels found on the page. Check if the username is correct or if Instagram requires login.NzCollecting up to u    reel links�c                 s  s    | ]}|j V  qd S N)r   r   r#   r#   r$   	<genexpr>^  s    zrun_scraper.<locals>.<genexpr>zFinished scraping z reels. Total views: zResults saved to zError during scraping: )&r   r   startget_chromium_versionr   r   installr	   ChromeOptionsadd_experimental_optionadd_argumentChromeset_window_sizerQ   rR   ro   infogetr   move_by_offsetperformr   rM   rN   presence_of_element_locatedr
   rL   r   errorquitstopr}   sumrk   r   r   resolve	Exception)r   r   r   rH   r   r   
PROFILE_IDTOKENgldisplaydebugger_addresschromium_versionr   r   rF   	start_urlr~   r   r   er#   r#   r$   run_scraper  sr   







r   argvList[str] | Nonec                 C  s   t jdd}|jdddd |jdtdd	d
 |jdddd |jddd |jddd |jdtddd
 || }|jdkrE|d t|j	|j|j
|j|j|j d S )NzMScrape the latest Instagram reels of a public profile using GoLogin profiles.)descriptionz
--usernameTzInstagram username (without @))requiredhelpz--reels   z.Number of recent reels to scrape (default: 20))typedefaultr   z--outputz Output file path (.json or .csv)z--profile-idzCGoLogin profile ID (optional, uses hardcoded value if not provided))r   z--tokenzBGoLogin API token (optional, uses hardcoded value if not provided)z--delayrE   z>Optional delay (seconds) between actions to mimic a human userr   z"--reels must be a positive integer)argparseArgumentParserr   r   r;   
parse_argsry   r   r   r   r   rH   r   r   )r   parserargsr#   r#   r$   mainw  s   


"r   __main__)r+   r   r   r   )rE   )rF   rG   rH   r;   r   rI   )rF   rG   r`   r   rH   r;   r   ra   )r~   ra   r   r   r   r   r   rI   )rE   NN)r   r   r   r   r   r   rH   r;   r   r   r   r   r   rI   r   )r   r   r   rI   )5r)   
__future__r   r   
contextlibr   r   ro   r8   sysrQ   dataclassesr   r   pathlibr   typingr   r   gologinr   seleniumr	   selenium.webdriver.common.byr
   selenium.webdriver.common.keysr   !selenium.webdriver.chrome.servicer   selenium.webdriver.support.uir   selenium.webdriver.supportr   rN   selenium.common.exceptionsr   r   selenium.webdriverr   webdriver_manager.chromer   pyvirtualdisplayr   basicConfigINFOr   rD   r_   r}   r   r   r   r&   r#   r#   r#   r$   <module>   sP   &
(3
4 k
