#!/usr/bin/env python3
"""
Extract video data from the folder URLs and save it to a CSV file.
CSV format (';'-delimited): url_folder;nama_folder;url_subfolder;title;url_video;url_thumbnail
"""

import sys
import csv
import re
import json
import logging
import argparse
from pathlib import Path
from datetime import datetime
from urllib.parse import urlparse, urljoin
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# Optional dependency: try to import curl_cffi, which is used for the
# Cloudflare bypass. HAS_CF_REQUESTS records whether the import worked; when
# it did not, cf_requests is None and the fetch code in this file falls back
# to the plain `requests` module.
try:
    from curl_cffi import requests as cf_requests
    HAS_CF_REQUESTS = True
except ImportError:
    HAS_CF_REQUESTS = False
    cf_requests = None

# Logging setup: every run writes a timestamped log file into the csv folder
# and also emits the same records through a StreamHandler.
# NOTE: this executes at import time, so importing this module creates the
# folder (if missing) and opens a new log file.
csv_folder = Path('/root/server/csv')
csv_folder.mkdir(exist_ok=True)  # Create the folder if it does not exist (parents are not created)
log_filename = csv_folder / f'extract_folder_csv_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Load download_upload_dc.py from its absolute path and execute it as a
# module, so that its helper functions can be reused below.
import importlib.util
spec = importlib.util.spec_from_file_location("download_upload_dc", "/root/server/download_upload_dc.py")
dc_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(dc_module)

# Re-export the existing functions from download_upload_dc under local names
extract_folder_videos = dc_module.extract_folder_videos
fetch_title_from_url = dc_module.fetch_title_from_url
detect_site = dc_module.detect_site
extract_video_url = dc_module.extract_video_url


def get_folder_name(folder_url):
    """Derive a short folder name from the path of a folder URL.

    For URLs shaped like ``/f/FOLDER_ID`` the segment right after ``f`` is
    returned; for any other non-empty path the last segment is returned.
    When the path has no segments at all the result is ``'unknown'``.
    """
    # Splitting on '/' and dropping empty pieces ignores leading, trailing
    # and doubled slashes.
    segments = list(filter(None, urlparse(folder_url).path.split('/')))

    if not segments:
        return 'unknown'

    # Format: /f/FOLDER_ID
    if segments[0] == 'f' and len(segments) > 1:
        return segments[1]

    return segments[-1]


def extract_subfolders_from_page(folder_url):
    """Fetch a folder page and collect the subfolder links it contains.

    The page is requested through curl_cffi (impersonating Chrome 120) when
    that package was importable, otherwise through plain ``requests``.

    Args:
        folder_url: URL of the folder page to scan.

    Returns:
        dict mapping absolute subfolder URL -> subfolder name (the run of
        digits captured after the link). An empty dict is returned when the
        response is not HTTP 200 or when any error occurs, so the result can
        always be treated as a mapping.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }

        # Prefer curl_cffi (Cloudflare bypass); fall back to requests.
        fetch_fn = cf_requests if HAS_CF_REQUESTS else requests
        fetch_kwargs = dict(headers=headers, timeout=30, allow_redirects=True)
        if HAS_CF_REQUESTS:
            fetch_kwargs['impersonate'] = 'chrome120'

        resp = fetch_fn.get(folder_url, **fetch_kwargs)
        if resp.status_code != 200:
            logger.warning(f"[subfolders] HTTP {resp.status_code}")
            # Same type as the success and error paths (a dict, not a list).
            return {}

        # Pattern that works for vid30s.com: href="/f/CODE" followed (lazily,
        # possibly across lines) by the folder number right before a '<'.
        pattern = r'href="(/f/[a-z0-9]+)".*?(\d+)<'
        matches = re.findall(pattern, resp.text, re.DOTALL)

        # Scheme and host are identical for every link, so build them once.
        parsed = urlparse(folder_url)
        origin = f"{parsed.scheme}://{parsed.netloc}"

        # Build dict of subfolder URL -> subfolder name; setdefault keeps the
        # first name seen when the same link appears more than once.
        subfolders = {}
        for folder_path, folder_name in matches:
            subfolders.setdefault(f"{origin}{folder_path}", folder_name.strip())

        logger.info(f"[subfolders] Found {len(subfolders)} subfolders")
        if subfolders:
            sample = list(subfolders.values())[:5]
            logger.info(f"[subfolders] Sample names: {sample}")

        return subfolders

    except Exception as e:
        # Best effort: a failure here just means "no subfolders found".
        logger.error(f"[subfolders] Error extracting subfolders: {e}")
        return {}


def extract_folder_videos_with_cf(folder_url, max_videos=None, is_recursive=False, subfolder_name=None, subfolder_url=None):
    """Collect video page links from a folder page, using the Cloudflare bypass.

    The page is fetched with curl_cffi (impersonating Chrome 120) when that
    package was importable, otherwise with plain ``requests``. Links are
    gathered with several regex patterns and de-duplicated per page. When the
    page yields no video links and this is not already a recursive call, the
    page is scanned for subfolders and each subfolder is processed one level
    deep.

    Args:
        folder_url: URL of the folder page to scan.
        max_videos: If truthy, truncate the result to this many entries.
        is_recursive: True when called for a subfolder; disables the
            subfolder scan so recursion stops after one level.
        subfolder_name: Name recorded for every video found on this page;
            ``'main'`` is used when it is empty or None.
        subfolder_url: URL recorded for every video found on this page;
            defaults to ``folder_url``.

    Returns:
        list of ``(video_url, subfolder_name, subfolder_url)`` tuples. An
        empty list is returned on a non-200 response or on any error.
    """
    logger.info(f"[folder-cf] Extracting videos dari folder (dengan CF bypass): {folder_url}")
    if subfolder_name:
        logger.info(f"[folder-cf] Subfolder name: {subfolder_name}")

    # The first call is for the main folder, which is its own "subfolder".
    if subfolder_url is None:
        subfolder_url = folder_url

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }

        # Prefer curl_cffi (Cloudflare bypass); fall back to requests.
        fetch_fn = cf_requests if HAS_CF_REQUESTS else requests
        fetch_kwargs = dict(headers=headers, timeout=30, allow_redirects=True)
        if HAS_CF_REQUESTS:
            fetch_kwargs['impersonate'] = 'chrome120'

        resp = fetch_fn.get(folder_url, **fetch_kwargs)
        if resp.status_code != 200:
            logger.warning(f"[folder-cf] HTTP {resp.status_code}")
            return []

        html = resp.text
        label = subfolder_name or 'main'
        video_urls = []
        seen = set()  # absolute URLs already recorded for this page

        def add_link(link):
            """Resolve *link* against the folder URL and record it once."""
            full_url = urljoin(folder_url, link)
            if full_url not in seen:
                seen.add(full_url)
                video_urls.append((full_url, label, subfolder_url))

        # Pattern 1: href="/v/CODE"
        for link in re.findall(r'href=["\'](/v/[a-zA-Z0-9_\-]+)["\']', html):
            add_link(link)

        # Pattern 2: href="/d/CODE"
        for link in re.findall(r'href=["\'](/d/[a-zA-Z0-9_\-]+)["\']', html):
            add_link(link)

        # Pattern 3: data-href="..."
        # NOTE(review): this accepts every data-href value, not only /v/ or
        # /d/ links - confirm that is intended.
        for link in re.findall(r'data-href=["\']([^"\']+)["\']', html):
            add_link(link)

        # Pattern 4: <a href="..."> (absolute or relative) containing /v/ or /d/
        for link in re.findall(r'<a[^>]+href=["\']([^"\']+)["\'"][^>]*>', html, re.IGNORECASE):
            if '/v/' in link or '/d/' in link:
                add_link(link)

        # No direct videos: this may be a folder of subfolders (vid30s pattern).
        if not video_urls and not is_recursive:
            logger.info("[folder-cf] No direct videos found, checking for subfolders...")
            subfolders_dict = extract_subfolders_from_page(folder_url)

            if subfolders_dict:
                logger.info(f"[folder-cf] Found {len(subfolders_dict)} subfolders, extracting videos from each...")
                # Every subfolder is processed (no cap).
                max_subfolders = len(subfolders_dict)
                processed = 0
                for sub_folder_url, sub_folder_name in subfolders_dict.items():
                    try:
                        sub_videos = extract_folder_videos_with_cf(
                            sub_folder_url,
                            max_videos=None,
                            is_recursive=True,
                            subfolder_name=sub_folder_name,
                            subfolder_url=sub_folder_url,
                        )
                        # Results from different subfolders are concatenated
                        # as-is; de-duplication is per page only.
                        video_urls.extend(sub_videos)
                        processed += 1
                    except Exception as e:
                        logger.warning(f"[folder-cf] Error extracting from subfolder {sub_folder_url}: {e}")

                logger.info(f"[folder-cf] Processed {processed}/{max_subfolders} subfolders")

        if max_videos:
            video_urls = video_urls[:max_videos]

        logger.info(f"[folder-cf] Total videos found: {len(video_urls)}")
        return video_urls

    except Exception as e:
        # Best effort: the caller treats an empty list as "nothing found".
        logger.error(f"[folder-cf] Error: {e}")
        return []


def extract_video_metadata(video_url, selected_columns=None):
    """Gather the metadata of one video page, limited to the selected columns.

    Args:
        video_url: URL of the video page.
        selected_columns: Column names that will be exported; defaults to
            all six columns. Only ``title``, ``url_video`` and
            ``url_thumbnail`` influence what is extracted.

    Returns:
        dict with the keys ``title``, ``url_video`` and ``url_thumbnail``.
        Values that are not requested or could not be extracted are empty
        strings. ``url_video`` holds the original page URL, not the extracted
        source URL. Errors are logged and never raised to the caller.
    """
    if selected_columns is None:
        selected_columns = ['url_folder', 'nama_folder', 'url_subfolder', 'title', 'url_video', 'url_thumbnail']

    try:
        # Work out which of the three metadata fields are actually wanted.
        want_title = 'title' in selected_columns
        want_video = 'url_video' in selected_columns
        want_thumb = 'url_thumbnail' in selected_columns

        # Nothing wanted: skip all network work.
        if not any((want_title, want_video, want_thumb)):
            logger.info(f"[extract] ⊘ Skipping extraction for {video_url} (not needed)")
            return {'title': '', 'url_video': '', 'url_thumbnail': ''}

        logger.info(f"[extract] Extracting metadata: title={want_title}, video={want_video}, thumb={want_thumb}")

        source_url = thumb = page_title = None

        # NOTE(review): the extracted source URL is only used for the log
        # line below; url_video is filled from the page URL. Confirm whether
        # the extraction is really needed when only url_video is selected.
        if want_video or want_thumb:
            try:
                source_url, _referer, thumb = extract_video_url(video_url)
            except Exception as e:
                logger.warning(f"[extract] Failed to extract video/thumbnail: {e}")
                source_url = thumb = None

        if want_title:
            try:
                page_title = fetch_title_from_url(video_url)
            except Exception as e:
                logger.warning(f"[extract] Failed to extract title: {e}")
                page_title = None

        logger.info(f"[extract] ✓ Completed - title={bool(page_title)}, video={bool(source_url)}, thumb={bool(thumb)}")

        result = {}
        result['title'] = page_title or ''
        # Store the original video page URL
        result['url_video'] = video_url if want_video else ''
        result['url_thumbnail'] = thumb or ''
        return result

    except Exception as e:
        logger.error(f"[extract] Error: {e}")
        return {'title': '', 'url_video': '', 'url_thumbnail': ''}


def process_folder(folder_url, max_workers=3, selected_columns=None):
    """Process one folder: list its videos and extract metadata for each.

    Video links are collected with the Cloudflare-bypass extractor first and
    with the ``download_upload_dc`` extractor as a fallback. Metadata is then
    extracted in a thread pool.

    Args:
        folder_url: URL of the folder to process.
        max_workers: Size of the thread pool used for metadata extraction.
        selected_columns: Column names to export; defaults to all columns.

    Returns:
        list of row dicts (keys ``url_folder``, ``nama_folder``,
        ``url_subfolder``, ``url_video``, ``title``, ``url_thumbnail``), in
        the order the extraction tasks finished. Empty when no videos were
        found by either method.
    """
    if selected_columns is None:
        selected_columns = ['url_folder', 'nama_folder', 'url_subfolder', 'title', 'url_video', 'url_thumbnail']

    logger.info(f"\n{'='*60}")
    logger.info(f"Processing folder: {folder_url}")
    logger.info(f"{'='*60}")

    folder_name = get_folder_name(folder_url)
    logger.info(f"Folder name: {folder_name}")

    # First attempt: extractor with the Cloudflare bypass.
    found = extract_folder_videos_with_cf(folder_url, max_videos=None)

    if not found:
        logger.warning("No videos found in folder (trying fallback method)")
        fallback_urls = extract_folder_videos(folder_url, max_videos=None)
        found = [(url, folder_name, folder_url) for url in fallback_urls]

    def as_triple(entry):
        """Coerce an entry to (url, subfolder_name, subfolder_url)."""
        if isinstance(entry, tuple):
            if len(entry) == 3:
                return tuple(entry)
            if len(entry) == 2:
                return (entry[0], entry[1], folder_url)
        # Plain URL string (or any other shape)
        return (entry, folder_name, folder_url)

    jobs = [as_triple(entry) for entry in found]
    total = len(jobs)

    logger.info(f"Total videos ditemukan: {total}")

    if not jobs:
        logger.warning("No videos found in folder (both methods)")
        return []

    # Extract the metadata of every video in parallel.
    rows = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for url, sub_name, sub_url in jobs:
            task = pool.submit(extract_video_metadata, url, selected_columns)
            pending[task] = (url, sub_name, sub_url)

        for done_count, task in enumerate(as_completed(pending), 1):
            video_url, sub_name, sub_url = pending[task]
            base_row = {
                'url_folder': folder_url,
                'nama_folder': sub_name,  # subfolder name rather than the parent folder ID
                'url_subfolder': sub_url,
                'url_video': video_url,
            }

            try:
                metadata = task.result()
                # Metadata keys override the base row (including url_video).
                rows.append({**base_row, **metadata})
                logger.info(f"[{done_count}/{total}] ✓ Completed (subfolder: {sub_name})")
            except Exception as e:
                logger.error(f"[{done_count}/{total}] ✗ Error: {e}")
                rows.append({**base_row, 'title': 'Error', 'url_thumbnail': ''})

    return rows


def main():
    """Command-line entry point.

    Reads folder URLs from ``/root/server/link_folder.txt`` (one per line;
    blank lines and lines starting with ``#`` are skipped), processes every
    folder, and writes the selected columns to a timestamped, ``;``-delimited
    CSV file in ``/root/server/csv``. When no rows were collected at all, a
    Playwright-based extraction is attempted for each folder, provided
    Playwright is installed.

    Command-line options:
        --columns: one or more column names to export (default: all).
            Names that are not known columns are ignored with a warning.
    """
    all_fieldnames = ['url_folder', 'nama_folder', 'url_subfolder', 'title', 'url_video', 'url_thumbnail']

    # Parse command-line arguments
    parser = argparse.ArgumentParser(
        description='Extract video metadata dari folder URLs dan simpan ke CSV',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Contoh penggunaan:
  python extract_folder_csv.py                              # Export semua kolom
  python extract_folder_csv.py --columns url_folder nama_folder url_subfolder  # Export 3 kolom
  python extract_folder_csv.py --columns url_folder title url_video            # Custom columns
        '''
    )
    parser.add_argument(
        '--columns',
        nargs='+',
        default=list(all_fieldnames),
        help='Kolom CSV yang ingin diexport (default: semua kolom)'
    )

    args = parser.parse_args()
    selected_columns = args.columns

    logger.info(f"Selected columns: {selected_columns}")

    # Unknown names are dropped when the CSV header is built; say so instead
    # of ignoring them silently.
    unknown_columns = [col for col in selected_columns if col not in all_fieldnames]
    if unknown_columns:
        logger.warning(f"Ignoring unknown columns: {unknown_columns}")

    # Read folder URLs
    link_file = Path('/root/server/link_folder.txt')
    if not link_file.exists():
        logger.error(f"File not found: {link_file}")
        return

    # Explicit UTF-8 so the result does not depend on the server locale
    # (the CSV output is written as UTF-8 as well).
    with open(link_file, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        folder_urls = [line for line in stripped_lines if line and not line.startswith('#')]

    logger.info(f"Total folder URLs: {len(folder_urls)}")
    if HAS_CF_REQUESTS:
        logger.info("✓ curl_cffi available for Cloudflare bypass")
    else:
        logger.warning("⚠ curl_cffi NOT available (install: pip install curl_cffi)")

    # Output CSV file - saved into the csv folder
    csv_folder = Path('/root/server/csv')
    csv_folder.mkdir(exist_ok=True)  # Create the folder if it does not exist
    csv_filename = csv_folder / f'folder_videos_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'

    # Process all folders; a failure in one folder does not stop the others.
    all_results = []
    for i, folder_url in enumerate(folder_urls, 1):
        logger.info(f"\n[{i}/{len(folder_urls)}] Processing folder...")
        try:
            results = process_folder(folder_url, max_workers=2, selected_columns=selected_columns)
            all_results.extend(results)
        except Exception as e:
            logger.error(f"Error processing {folder_url}: {e}")

    # Save to CSV
    logger.info(f"\n{'='*60}")
    logger.info(f"Saving {len(all_results)} videos to CSV...")
    logger.info(f"CSV file: {csv_filename}")
    logger.info(f"{'='*60}")

    if all_results:
        # Keep the canonical column order, restricted to the selected columns.
        fieldnames = [col for col in all_fieldnames if col in selected_columns]

        logger.info(f"Fieldnames to export: {fieldnames}")

        with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            # extrasaction='ignore' drops row keys that were not selected.
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';', extrasaction='ignore')
            writer.writeheader()
            writer.writerows(all_results)

        logger.info(f"✓ CSV saved: {csv_filename}")
        logger.info(f"Total rows: {len(all_results)}")
        logger.info(f"Columns: {fieldnames}")
    else:
        logger.warning("No results to save")
        logger.info("\n⚠ Keterangan:")
        logger.info("  Folder vidvf.com mungkin:")
        logger.info("  1. Kosong (belum ada video)")
        logger.info("  2. Memerlukan JavaScript rendering (perlu Playwright)")
        logger.info("  3. Memerlukan authentication")
        logger.info("\n  Mari coba dengan Playwright browser...")

        # Fall back to Playwright; the import only probes availability.
        try:
            from playwright.sync_api import sync_playwright  # noqa: F401
        except ImportError:
            logger.warning("Playwright tidak tersedia (pip install playwright)")
        else:
            logger.info("\n✓ Playwright available, trying headless browser...")

            for folder_url in folder_urls:
                try_playwright_extraction(folder_url, csv_filename, selected_columns)


def try_playwright_extraction(folder_url, csv_filename, selected_columns=None):
    """Extract a folder's videos with a headless Playwright browser.

    The folder page is rendered in headless Chromium, ``/v/CODE`` and
    ``/d/CODE`` links are collected from the rendered HTML, metadata is
    extracted for each link, and the rows are appended to ``csv_filename``.
    The CSV header is written only when the file is still empty.

    Args:
        folder_url: URL of the folder page to render.
        csv_filename: Path of the CSV file to append to.
        selected_columns: Column names to export; defaults to all columns.

    Returns:
        None. Any error (including Playwright not being installed) is logged
        and swallowed.
    """
    if selected_columns is None:
        selected_columns = ['url_folder', 'nama_folder', 'url_subfolder', 'title', 'url_video', 'url_thumbnail']

    logger.info(f"[playwright] Processing: {folder_url}")
    logger.info(f"[playwright] Selected columns: {selected_columns}")

    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )
                page = context.new_page()

                logger.info("[playwright] Loading page...")
                page.goto(folder_url, wait_until='networkidle', timeout=30000)
                # Extra fixed wait after the network went idle.
                page.wait_for_timeout(3000)

                # Rendered HTML of the page
                content = page.content()

                # Collect video links: first every /v/CODE link, then every
                # /d/CODE link.
                links = re.findall(r'href=["\'](/v/[a-zA-Z0-9_\-]+)["\']', content)
                links += re.findall(r'href=["\'](/d/[a-zA-Z0-9_\-]+)["\']', content)
                # dict.fromkeys removes duplicates while keeping first-seen order.
                video_urls = list(dict.fromkeys(urljoin(folder_url, link) for link in links))

                logger.info(f"[playwright] Found {len(video_urls)} videos")

                # Extract metadata
                all_results = []
                folder_name = get_folder_name(folder_url)

                for idx, video_url in enumerate(video_urls, 1):
                    logger.info(f"[playwright] [{idx}/{len(video_urls)}] Extracting {video_url}...")
                    # Forward the selection so only the needed fields are
                    # extracted, as process_folder does.
                    metadata = extract_video_metadata(video_url, selected_columns)
                    all_results.append({
                        'url_folder': folder_url,
                        'nama_folder': folder_name,
                        'url_subfolder': folder_url,  # no subfolder here: reuse folder_url
                        'url_video': video_url,
                        **metadata
                    })

                # Save to CSV
                if all_results:
                    all_fieldnames = ['url_folder', 'nama_folder', 'url_subfolder', 'title', 'url_video', 'url_thumbnail']
                    fieldnames = [col for col in all_fieldnames if col in selected_columns]

                    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';', extrasaction='ignore')
                        # Write the header only when the file is still empty
                        if csvfile.tell() == 0:
                            writer.writeheader()
                        writer.writerows(all_results)

                    logger.info(f"[playwright] ✓ Added {len(all_results)} rows to {csv_filename}")
                    logger.info(f"[playwright] Columns: {fieldnames}")
            finally:
                # Close the browser even when a step above raised.
                browser.close()

    except Exception as e:
        logger.error(f"[playwright] Error: {e}")
        import traceback
        logger.debug(traceback.format_exc())


if __name__ == '__main__':
    main()
