#!/usr/bin/env python
#
# This file is part of ddr-cmdln/ddr
#
#  

# One-line summary; passed to argparse.ArgumentParser as `description` in main().
description = """Checks collection repository for non-UTF-8 chars, lists offending files."""

# Usage example and longer explanation; passed to argparse.ArgumentParser as
# `epilog` in main().  main() selects RawDescriptionHelpFormatter, so the
# line breaks below are part of the help output.
# Note: main() replaces the literal text 'REPO' in dest_dir with the
# collection id before cloning.
epilog = """
Example:
    $ ddr-checkencoding git@mits.densho.org:ddr-test-123.git /var/www/media/base/temp

Clones collection repo to specified location, loads every .json file in the collection with strict utf-8 encoding,
then removes the directory.  This should surface any UTF-8 encoding problems.

"""

import argparse
import codecs
from datetime import datetime
import json
import logging
import os
import shutil
import sys

import chardet
import git

from DDR import models
from DDR import util


def out(verbose, text):
    """Print text to stdout, but only in verbose mode.

    @param verbose: any value; nothing is printed when it is falsy
    @param text: object to print
    """
    if not verbose:
        return
    print(text)

def extract_collection_id(url):
    """Pull the collection ID out of a repository URL.

    Expects the form git@mits.densho.org:REPO.git and returns REPO: the
    text between the first and second ':' with its file extension removed.

    @param url: repository URL string
    @returns: collection ID string
    """
    pieces = url.split(':')
    repo_filename = pieces[1]
    collection_id, _extension = os.path.splitext(repo_filename)
    return collection_id

def clone(url, destpath):
    """Plain git clone of a repository (does not go through ddr-clone).

    @param url: repository URL, handed to git.Repo.clone_from unchanged
    @param destpath: destination path, handed to git.Repo.clone_from unchanged
    @returns: whatever git.Repo.clone_from returns
    """
    repo = git.Repo.clone_from(url, destpath)
    return repo

def clean(repo_path):
    """Delete the repository directory tree from the filesystem.

    Errors from the removal are not suppressed; they propagate to the caller.

    @param repo_path: path of the directory to remove
    """
    shutil.rmtree(repo_path, ignore_errors=False)

def analyze_files(paths, verbose=False):
    """Opens files with strict encoding; lists paths that fail to decode

    Each file is read in full as strict UTF-8.  A path is reported as
    defective when that read raises UnicodeDecodeError; any other error
    (e.g. a missing file) propagates to the caller.

    In verbose mode a '.' is written to stdout for every clean file, and for
    every defective file the path is printed together with chardet's guess
    at the actual encoding.

    @param paths: list of file paths to check
    @param verbose: boolean; print progress and per-defect details
    @returns: list of defective paths, in the order they were encountered
    """
    defects = []
    for path in paths:
        try:
            with codecs.open(path, 'r', encoding='utf-8', errors='strict') as f:
                f.read()
        except UnicodeDecodeError:
            defects.append(path)
            if verbose:
                # Re-read as raw bytes: chardet works on undecoded bytes, and
                # a text-mode read could fail on the same bad bytes again.
                # Only done in verbose mode since the guess is only printed.
                with open(path, 'rb') as f:
                    raw = f.read()
                guess = chardet.detect(raw)
                print('\n| %s %s' % (path, guess))
        else:
            if verbose:
                sys.stdout.write('.')
    if paths and verbose:
        # terminate the line of progress dots
        print('')
    return defects


def main():
    """Command-line entry point: clone a collection repo and report encoding defects.

    Parses the command line, clones repo_url into dest_dir (any literal
    'REPO' in dest_dir is replaced with the collection id), checks every
    file returned by util.find_meta_files for strict UTF-8 decodability,
    removes the clone, and prints a one-line summary as CSV (-c), JSON (-j)
    or plain text.  If both -c and -j are given, CSV wins.
    """

    parser = argparse.ArgumentParser(description=description, epilog=epilog,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('repo_url', help='Repository URL.')
    parser.add_argument('dest_dir', help='Absolute path to destination dir.')
    parser.add_argument(
        '-v', '--verbose', action='store_const', const=1,
        help='Print lots of output. Important lines prefixed with "%%%%".'
    )
    parser.add_argument(
        '-c', '--csv', action='store_const', const=1,
        help='Print output in CSV-friendly form.'
    )
    parser.add_argument(
        '-H', '--headers', action='store_const', const=1,
        help='Print CSV headers (requires -c).'
    )
    parser.add_argument(
        '-j', '--json', action='store_const', const=1,
        help='Print output in JSON-friendly form.'
    )

    args = parser.parse_args()

    collection_id = extract_collection_id(args.repo_url)
    repo_path = args.dest_dir.replace('REPO', collection_id)
    out(args.verbose, collection_id)
    out(args.verbose, repo_path)

    # if verbose, add marker to important lines
    if args.verbose:
        prefix = '%% '
    else:
        prefix = ''

    if args.csv and args.headers:
        print('%scollection id, files, defects, elapsed' % prefix)

    start = datetime.now()
    out(args.verbose, start)

    out(args.verbose, 'clone %s %s' % (args.repo_url, repo_path))
    repo = clone(args.repo_url, repo_path)
    out(args.verbose, repo)

    # Always remove the clone, even if the analysis raises, so a failed run
    # does not leave a repository behind in dest_dir.
    try:
        out(args.verbose, 'analyzing')
        paths = util.find_meta_files(repo_path, recursive=True)
        defects = analyze_files(paths, args.verbose)
    finally:
        out(args.verbose, 'cleaning up')
        clean(repo_path)

    end = datetime.now()
    elapsed = end - start
    out(args.verbose, end)

    # print() is called with a single pre-formatted string so the output is
    # the same whether print is a statement (Python 2) or a function.
    if args.csv:
        print('%s%s' % (
            prefix,
            ','.join([
                str(collection_id), str(len(paths)), str(len(defects)), str(elapsed)
            ])
        ))
    elif args.json:
        data = {
            'collection id': collection_id,
            'files': len(paths),
            'defects': len(defects),
            'elapsed': str(elapsed),
            }
        print('%s%s' % (
            prefix,
            json.dumps(data)
        ))
    else:
        print('%s%s, %s files, %s bad, %s elapsed' % (
            prefix, collection_id, len(paths), len(defects), elapsed))


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
