"""
Script kéo dữ liệu thông số kỹ thuật từ globalfastener.com
Sử dụng: python scrape_fastener.py

Bảng dữ liệu trên website dùng format TRANSPOSE:
- Hàng = tên thông số (d1 min, d1 max, d2, b, s, h, k, r, weight...)
- Cột = kích thước (Φ3, Φ4, M8, M10...)
- Dữ liệu chia thành nhiều cặp bảng (label + values)
"""

import requests
from bs4 import BeautifulSoup
import json
import time
import re
import sys
import os
from urllib.parse import urljoin

# Ensure UTF-8 console output on Windows (needed for "Φ" etc. in the data).
os.environ["PYTHONIOENCODING"] = "utf-8"
# Fix: the original compared `sys.stdout.encoding != "utf-8"`, which is
# case-sensitive (the common value is "UTF-8") and raised AttributeError when
# stdout/stderr was replaced by a stream without `.encoding`/`.reconfigure`
# (pipes, capture in test runners). Normalize the case and guard instead.
if (getattr(sys.stdout, "encoding", "") or "").lower() not in ("utf-8", "utf8"):
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
    if hasattr(sys.stderr, "reconfigure"):
        sys.stderr.reconfigure(encoding="utf-8")

BASE_URL = "https://www.globalfastener.com"

# Scrapable categories: key -> {"sort": site category id, "name": display label}.
CATEGORIES = {
    "hex_bolts": {"sort": 6, "name": "Hex Bolts & Screws (Bu long luc giac)"},
    "hex_structural_bolts": {"sort": 124, "name": "Hex Structural Bolts"},
    "thread_rods": {"sort": 113, "name": "Thread Rods (Ty ren)"},
    "hex_nuts": {"sort": 25, "name": "Hex Nuts (Dai oc luc giac)"},
    "locking_nuts": {"sort": 27, "name": "Locking Nuts (Dai oc chong xoay)"},
    "machine_screws": {"sort": 38, "name": "Machine Screws (Vit may)"},
    "hex_socket_screws": {"sort": 39, "name": "Hex Socket Screws (Vit luc giac chim)"},
    "tapping_screws": {"sort": 40, "name": "Tapping Screws (Vit tu khoan)"},
    "plain_washers": {"sort": 54, "name": "Plain Washers (Long den phang)"},
    "spring_washers": {"sort": 55, "name": "Spring Washers (Long den venh)"},
    "disc_spring": {"sort": 146, "name": "Disc Spring (Long den dia)"},
}

# Minimal browser-like User-Agent so the site serves the normal pages.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}


def get_standards_list(sort_id):
    """Fetch the list of standards from a category listing page.

    Returns a list of {"name": ..., "url": ...} dicts, de-duplicated by URL.
    """
    url = f"{BASE_URL}/standards/?sort={sort_id}"
    print(f"  Loading: {url}")

    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    found = []
    for anchor in page.find_all("a", href=re.compile(r"detail\.php\?sid=")):
        href = anchor.get("href", "")
        label = anchor.get_text(strip=True)
        # Skip "View Suppliers" and other non-standard links.
        if not label or "sid=" not in href:
            continue
        if label.lower() in ("view suppliers", "view", "suppliers"):
            continue
        found.append({"name": label, "url": urljoin(BASE_URL + "/standards/", href)})

    # De-duplicate by URL while preserving first-seen order.
    seen_urls = set()
    unique = []
    for entry in found:
        if entry["url"] in seen_urls:
            continue
        seen_urls.add(entry["url"])
        unique.append(entry)

    return unique


def parse_transposed_tables(soup):
    """
    Parse the transposed dimension tables used by globalfastener.com.

    Pattern: each page has N table groups; each group is 3 consecutive tables:
      [SIZE_TABLE] -> [LABEL_TABLE] -> [VALUE_TABLE]

    SIZE_TABLE: row 0 has "Size"/"Nominal Diameter" plus many size columns
    LABEL_TABLE: 1-2 columns, many rows, parameter names (d1 min, d2 max, b, s...)
    VALUE_TABLE: #columns = #sizes, #rows = #parameters

    Returns a list of dicts, one per size, mapping parameter name -> cell text.
    Groups are merged by size name, so later groups add keys to the same row.
    """
    tables = soup.find_all("table")
    if not tables:
        return []

    # Classify every table on the page by shape and first-cell content.
    table_info = []  # (index, type, num_rows, num_cols)

    for ti, table in enumerate(tables):
        rows = table.find_all("tr")
        if not rows:
            table_info.append((ti, "empty", 0, 0))
            continue

        first_cells = rows[0].find_all(["td", "th"])
        first_text = first_cells[0].get_text(strip=True) if first_cells else ""
        ncols = len(first_cells)
        nrows = len(rows)

        # SIZE table: first cell is "Size" or "Nominal Diameter" (or similar)
        if ncols > 5 and any(kw in first_text for kw in ("Size", "Nominal Diameter")):
            table_info.append((ti, "size", nrows, ncols))
        # LABEL table: 1-3 columns, many rows, first cell is a parameter name
        elif ncols <= 3 and nrows >= 5:
            table_info.append((ti, "label", nrows, ncols))
        # VALUE table: many columns, many rows, first cell is numeric
        elif ncols >= 5 and nrows >= 3:
            try:
                float(first_text.replace(",", ".").replace("(", "").replace(")", ""))
                table_info.append((ti, "value", nrows, ncols))
            except (ValueError, AttributeError):
                table_info.append((ti, "other", nrows, ncols))
        else:
            table_info.append((ti, "other", nrows, ncols))

    # Walk the groups: SIZE -> LABEL -> VALUE
    all_sizes_data = {}

    for i, (ti, ttype, nrows, ncols) in enumerate(table_info):
        if ttype != "size":
            continue

        size_table = tables[ti]

        # Extract sizes from row 1 (row 0 holds merged/grouped columns,
        # row 1 the individual size columns)
        sizes = []
        size_rows = size_table.find_all("tr")
        # Try row 1 first (it usually carries the individual sizes)
        for row_idx in [1, 0]:
            if row_idx >= len(size_rows):
                continue
            row = size_rows[row_idx]
            cells = row.find_all(["td", "th"])
            for cell in cells:
                text = cell.get_text(strip=True)
                # Skip header text
                if not text or any(kw in text for kw in ("Size", "Nominal", "Diameter")):
                    continue
                # Skip merged columns (several sizes combined into one cell);
                # heuristic: a real size label is short
                if len(text) > 20:
                    continue
                # Check it looks like a size: "Φ8", "M10", "3", "(42)", etc.
                cleaned = text.replace("(", "").replace(")", "")
                if re.match(r'^[ΦMd]?[\d.]+$', cleaned):
                    if text not in sizes:
                        sizes.append(text)

            if sizes:
                break

        if not sizes:
            continue

        # Find the following LABEL table and VALUE table of this group
        label_idx = None
        value_idx = None
        for j in range(i + 1, len(table_info)):
            tj, tjtype, tjrows, tjcols = table_info[j]
            if tjtype == "label" and label_idx is None:
                label_idx = tj
            elif tjtype == "value" and value_idx is None:
                value_idx = tj
                break
            elif tjtype == "size":
                break  # Reached the next group

        if value_idx is None:
            continue

        # Build parameter names from the LABEL table
        param_names = []
        if label_idx is not None:
            label_table = tables[label_idx]
            current_prefix = ""
            for row in label_table.find_all("tr"):
                cells = row.find_all(["td", "th"])
                parts = [c.get_text(strip=True) for c in cells]

                if len(parts) == 2:
                    # First cell = param group (d1, b, s, h...), second = sub-label (min, max, Nominal Size)
                    current_prefix = parts[0]
                    sub = parts[1]
                    param_names.append(f"{current_prefix} {sub}")
                elif len(parts) == 1:
                    text = parts[0]
                    if text in ("min", "max", "Nominal Size"):
                        param_names.append(f"{current_prefix} {text}")
                    else:
                        # Standalone param (k, r, per 1000 units, For Nominal Thread Diameter)
                        param_names.append(text)
                        current_prefix = text

        # Fallback parameter names when label/value row counts disagree
        value_table = tables[value_idx]
        value_rows = value_table.find_all("tr")

        if len(param_names) != len(value_rows):
            param_names = [f"param_{idx}" for idx in range(len(value_rows))]

        # Map each value row onto the sizes (column order matches size order)
        for row_idx, value_row in enumerate(value_rows):
            cells = value_row.find_all(["td", "th"])
            values = [c.get_text(strip=True) for c in cells]
            param = param_names[row_idx] if row_idx < len(param_names) else f"param_{row_idx}"

            for col_idx, val in enumerate(values):
                if col_idx < len(sizes):
                    size_name = sizes[col_idx]
                    if size_name not in all_sizes_data:
                        all_sizes_data[size_name] = {"size": size_name}
                    all_sizes_data[size_name][param] = val

    return list(all_sizes_data.values())


def parse_standard_table(soup):
    """Parse a globalfastener spec page into a list of row dicts.

    Tries the transposed-table layout first; if that yields nothing, falls
    back to plain header-row table parsing.
    """
    # Transposed layout takes priority.
    transposed = parse_transposed_tables(soup)
    if transposed:
        return transposed

    # Fallback: normal tables with a header row followed by data rows.
    records = []
    for table in soup.find_all("table"):
        rows = table.find_all("tr")
        if len(rows) < 3:
            continue

        headers = [cell.get_text(strip=True) for cell in rows[0].find_all(["td", "th"])]
        if len(headers) < 2:
            continue

        for row in rows[1:]:
            cells = row.find_all(["td", "th"])
            if len(cells) < 2:
                continue
            record = {}
            for idx, cell in enumerate(cells):
                column = headers[idx] if idx < len(headers) else f"col_{idx}"
                record[column] = cell.get_text(strip=True)
            records.append(record)

    return records


def scrape_standard_detail(url, standard_name):
    """Scrape the detailed technical specs of one standard page.

    Returns a result dict, or None when the page could not be loaded.
    """
    print(f"    Scraping: {standard_name}")

    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"    ! Error loading {url}: {exc}")
        return None

    page = BeautifulSoup(response.text, "html.parser")

    # Take the first plausible heading as the description.
    description = ""
    for heading in page.find_all(["h1", "h2", "h3"]):
        heading_text = heading.get_text(strip=True)
        if heading_text and len(heading_text) > 5 and "Click" not in heading_text:
            description = heading_text
            break

    # Parse the dimension tables.
    dimensions = parse_standard_table(page)

    return {
        "standard": standard_name.strip(),
        "description": description,
        "url": url,
        "dimensions": dimensions,
        "count": len(dimensions),
    }


def scrape_category(category_key):
    """Scrape every standard in one category and return a summary dict."""
    category = CATEGORIES[category_key]
    print(f"\n{'='*60}")
    print(f"  Category: {category['name']}")
    print(f"{'='*60}")

    standards = get_standards_list(category["sort"])
    print(f"  Found {len(standards)} standards")

    scraped = []
    for std in standards:
        detail = scrape_standard_detail(std["url"], std["name"])
        if detail and detail["dimensions"]:
            scraped.append(detail)
            print(f"    OK {std['name']}: {len(detail['dimensions'])} sizes")
        else:
            print(f"    EMPTY {std['name']}: no table data")
        # Throttle requests to be polite to the server.
        time.sleep(1.5)

    return {
        "category": category["name"],
        "sort_id": category["sort"],
        "standards": scraped,
        "total_standards": len(scraped),
    }


def scrape_multiple(category_keys):
    """Scrape several categories; returns {category_key: category_result}."""
    collected = {}
    for key in category_keys:
        if key in CATEGORIES:
            collected[key] = scrape_category(key)
            continue
        # Unknown key: report it and list the valid choices, then keep going.
        print(f"! Unknown category '{key}'. Valid:")
        for valid_key, info in CATEGORIES.items():
            print(f"  - {valid_key}: {info['name']}")
    return collected


def save_data(data, filename="fastener_specs.json"):
    """Save the scraped data as UTF-8 JSON and print a record count.

    Args:
        data: mapping of category key -> category result dict.
        filename: output path for the JSON file.

    Returns:
        The path the file was written to.
    """
    # Fix: the original ignored the `filename` parameter entirely and wrote to
    # a hard-coded, machine-specific path ("d:/12/..."). Honor the parameter.
    filepath = filename
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"\nSaved: {filepath}")

    # Count total dimension rows across all categories and standards.
    total = 0
    for cat_data in data.values():
        for std in cat_data.get("standards", []):
            total += len(std.get("dimensions", []))
    print(f"Total records: {total}")

    return filepath


def lookup(data, query):
    """Look up specs by product code in scraped data.

    Examples: lookup(data, "DIN 127 M8"), lookup(data, "DIN 127 8"),
    lookup(data, "DIN 127 PHI8").

    Args:
        data: mapping produced by scrape_multiple().
        query: standard name optionally followed by a size.

    Returns:
        When a size is present: full spec rows matching standard + size.
        Otherwise: per-standard summaries listing the available sizes.
    """
    query_upper = query.upper().strip()

    # Split the query into a standard-name part and a trailing size.
    # Supported size forms: "8", "M8", "Φ8", "D8", "PHI8".
    # (Fix: the original regex `[MΦ]?(\d...)` never matched the documented
    # "PHI8" form, so those queries searched for a standard named "... PHI".)
    size_match = re.search(r'(?:PHI|[MΦD])?(\d+\.?\d*)\s*$', query_upper)
    target_size_num = None
    standard_query = query_upper

    if size_match:
        target_size_num = size_match.group(1)
        standard_query = query_upper[:size_match.start()].strip()

    # Collapse repeated whitespace so "DIN  127" still matches "DIN 127".
    standard_query_normalized = re.sub(r'\s+', ' ', standard_query)

    results = []
    for cat_key, cat_data in data.items():
        for std in cat_data.get("standards", []):
            std_name_normalized = re.sub(r'\s+', ' ', std["standard"].upper())

            if standard_query_normalized and standard_query_normalized not in std_name_normalized:
                continue

            if target_size_num and std["dimensions"]:
                for dim in std["dimensions"]:
                    # Normalize the stored size: "Φ8" -> "8", "M10" -> "10",
                    # "(42)" -> "42".
                    # (Fix: the original only stripped a leading Φ/M/d, so
                    # parenthesized sizes like "(42)" could never match.)
                    raw_size = dim.get("size", "")
                    size_clean = re.sub(r'[^\d.]', '', raw_size)

                    # Some tables (washers) key rows by thread diameter instead.
                    thread_dia = dim.get("For Nominal Thread Diameter", "").strip()

                    if (size_clean == target_size_num or
                        thread_dia == target_size_num):

                        results.append({
                            "standard": std["standard"],
                            "description": std["description"],
                            "size": raw_size,
                            "thread_diameter": thread_dia,
                            "specs": {k: v for k, v in dim.items()
                                      if k not in ("size", "For Nominal Thread Diameter")}
                        })
            elif not target_size_num:
                # No size requested: return a summary of this standard.
                results.append({
                    "standard": std["standard"],
                    "description": std["description"],
                    "total_sizes": len(std["dimensions"]),
                    "available_sizes": [d.get("size", "?") for d in std["dimensions"][:10]]
                })

    return results


def print_results(results):
    """Pretty-print lookup() results to stdout."""
    if not results:
        print("  Not found.")
        return

    for entry in results:
        print(f"\n  {entry['standard']}")
        if entry.get("description"):
            print(f"  {entry['description']}")

        if "specs" in entry:
            # Detailed match: one size with its full parameter set.
            print(f"  Size: {entry['size']}")
            for spec_name, spec_value in entry["specs"].items():
                print(f"    {spec_name}: {spec_value}")
        elif "available_sizes" in entry:
            # Summary match: standard-level overview only.
            print(f"  Total: {entry['total_sizes']} sizes")
            print(f"  Ex: {', '.join(entry['available_sizes'])}...")


# =====================================================
# Script entry point: list the categories, scrape the configured ones,
# save to JSON, then smoke-test the lookup helper.
if __name__ == "__main__":
    print("GLOBAL FASTENER SPEC SCRAPER")
    print("=" * 40)
    print()
    print("Available categories:")
    for key, val in CATEGORIES.items():
        print(f"  {key:25s} -> {val['name']}")
    print()

    # ===== CONFIG: choose the categories to scrape =====
    categories_to_scrape = [
        "spring_washers",      # Spring washers
        # "plain_washers",     # Plain washers
        # "hex_bolts",         # Hex bolts
        # "hex_nuts",          # Hex nuts
        # "hex_socket_screws", # Hex socket screws
    ]

    data = scrape_multiple(categories_to_scrape)
    save_data(data)

    # Smoke-test lookup() against the freshly scraped data
    print("\n" + "=" * 40)
    print("TEST LOOKUP")
    print("=" * 40)

    test_queries = ["DIN 127 8", "DIN 127 10", "DIN 128 12"]
    for q in test_queries:
        print(f"\nQuery: {q}")
        results = lookup(data, q)
        print_results(results)
