#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Deduplicate kan.py error logs and extract facility-code details.

Inputs
- backend/error_logs_kanzou_20250922.csv (utf-8-sig)

Outputs
- backend/error_logs_kanzou_20250922_dedup.csv
  - Dedupe by error case. Facility code errors keep per-facility details.
- backend/error_decimal_summary_20250922.csv
  - Unique decimal-related errors with counts and sample values.
"""

import csv
import io
import os
import re
import ast
from collections import defaultdict

SRC = os.path.join(os.path.dirname(__file__), 'error_logs_kanzou_20250922.csv')
OUT_DEDUP = os.path.join(os.path.dirname(__file__), 'error_logs_kanzou_20250922_dedup.csv')
OUT_DECIMAL = os.path.join(os.path.dirname(__file__), 'error_decimal_summary_20250922.csv')
OUT_MISSING_FAC = os.path.join(os.path.dirname(__file__), 'missing_facilities_20250922.txt')


def parse_column_order(sql: str):
    if not sql:
        return []
    # Extract between first '(' and ')'
    m = re.search(r"\(([^\)]*)\)", sql, re.S)
    if not m:
        return []
    cols = [c.strip() for c in m.group(1).split(',')]
    # Filter out empty/placeholder lines
    cols = [c for c in cols if c]
    return cols


def parse_input_values(values_str: str):
    if not values_str:
        return []
    try:
        return list(ast.literal_eval(values_str))
    except Exception:
        # Try to recover simple list patterns
        return []


def extract_sisetu_cd(sql: str, values_str: str):
    cols = parse_column_order(sql)
    if not cols:
        return None
    try:
        idx = [c.upper() for c in cols].index('SISETU_CD')
    except ValueError:
        return None
    vals = parse_input_values(values_str)
    if idx < len(vals):
        return vals[idx]
    return None


def main():
    if not os.path.exists(SRC):
        raise SystemExit(f"Source file not found: {SRC}")

    # Read as utf-8-sig
    with io.open(SRC, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        rows = list(reader)

    # Grouping stores
    dedup_general = {}  # key -> row sample + count
    dedup_facility = {}  # key -> row sample + count + sisetu_cd
    decimal_summary = {}  # key -> count + sample value

    missing_facilities = set()
    DEFAULT_CODES = {"19720024", "0000000", "9999999"}

    for r in rows:
        table = r.get('テーブル名', '')
        reg_id = r.get('移植登録ID', '')
        fac_name = r.get('施設名', '')
        col = r.get('カラム名', '')
        data = r.get('データ', '')
        err = r.get('エラー内容', '')
        sql = r.get('SQL', '')
        values = r.get('入力値', '')

        # Facility code specific handling
        is_sisetu_err = 'SISETU_CD' in (err or '') or (col or '').upper() == 'SISETU_CD'
        if is_sisetu_err:
            sisetu_cd = extract_sisetu_cd(sql or '', values or '')
            key = (table, 'SISETU_CD', err, str(sisetu_cd), fac_name)
            if key not in dedup_facility:
                dedup_facility[key] = {
                    'テーブル名': table,
                    'カラム名': 'SISETU_CD',
                    'エラー内容': err,
                    '施設コード': sisetu_cd,
                    '施設名': fac_name,
                    '発生件数': 1,
                    'サンプル移植登録ID': reg_id,
                }
            else:
                dedup_facility[key]['発生件数'] += 1
            # Track likely-missing facilities (default code used)
            if sisetu_cd and str(sisetu_cd) in DEFAULT_CODES:
                if fac_name:
                    missing_facilities.add(str(fac_name))
            continue

        # General dedupe: by (table, column, error)
        key = (table, col, err)
        if key not in dedup_general:
            dedup_general[key] = {
                'テーブル名': table,
                'カラム名': col,
                'エラー内容': err,
                'サンプルデータ': data,
                '発生件数': 1,
                '施設名(例)': fac_name,
                'サンプル移植登録ID': reg_id,
            }
        else:
            dedup_general[key]['発生件数'] += 1

        # Decimal error summary
        if 'Incorrect decimal value' in (err or '') or 'Data too long for column' in (err or ''):
            if col and any(tok in col for tok in ['PT', 'IgG', 'AMMONIA', 'GV/SLV', 'PIVKA', 'BMI']):
                dkey = (table, col)
                if dkey not in decimal_summary:
                    decimal_summary[dkey] = {
                        'テーブル名': table,
                        'カラム名': col,
                        'サンプル値': data,
                        '発生件数': 1,
                        '例の施設名': fac_name,
                    }
                else:
                    decimal_summary[dkey]['発生件数'] += 1

    # Write dedup CSV
    # Combined: facility rows first (preserve facility detail), then general rows
    dedup_rows = list(dedup_facility.values()) + list(dedup_general.values())
    dedup_fieldnames = [
        'テーブル名', 'カラム名', 'エラー内容',
        '施設コード', '施設名',
        'サンプルデータ', '発生件数', '施設名(例)', 'サンプル移植登録ID'
    ]
    with io.open(OUT_DEDUP, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=dedup_fieldnames)
        writer.writeheader()
        for row in dedup_rows:
            writer.writerow(row)

    # Write decimal summary CSV
    dec_fields = ['テーブル名', 'カラム名', 'サンプル値', '発生件数', '例の施設名']
    with io.open(OUT_DECIMAL, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=dec_fields)
        writer.writeheader()
        for row in decimal_summary.values():
            writer.writerow(row)

    print(f"Wrote dedup: {OUT_DEDUP}")
    print(f"Wrote decimal summary: {OUT_DECIMAL}")

    # Write missing facilities list (unique)
    if missing_facilities:
        with io.open(OUT_MISSING_FAC, 'w', encoding='utf-8') as f:
            f.write("施設マスタ未登録の可能性がある施設名一覧\n")
            for name in sorted(missing_facilities):
                f.write(name + "\n")
        print(f"Wrote missing facilities: {OUT_MISSING_FAC}")


if __name__ == '__main__':
    main()