/* Handles raw genome data exports from 23andMe, Strategene (maybe all Diagnomics files), Ancestry, SelfDecode and PLINK. */

import {TRACE_INGEST} from "../Constants";

/** Version tag of this ingester; it is appended to the error and warning messages it produces. */
const INGEST_VERSION_NO = "v0.1";

/** Limit on how many leading lines of a file are examined when looking for the column-headers line. */
const MAX_LINES_TO_SEARCH_FOR_HEADERS = 30;

/**
 * Removes a leading "GSA-" from identifiers of the form "GSA-rs...", leaving the bare "rs..." id.
 * The match is case-sensitive and anchored at the start; any other token is returned unchanged.
 *
 * @param {string} token - Identifier column as read from the data row.
 * @returns {string} The identifier without the "GSA-" prefix.
 */
const stripGsaPrefixSelfDecode = (token) => {
    const gsaPrefix = "GSA-";
    if (token.startsWith(gsaPrefix + "rs")) {
        return token.slice(gsaPrefix.length);
    }
    return token;
};

/**
 * Parsing rules for each supported raw-data file format.  ingestGenomeData uses the
 * first entry whose first_line_pattern matches the (trimmed) first line of the file.
 *
 * Fields of each entry:
 *  - name:                    label of the format
 *  - first_line_pattern:      regex tested against the first line to recognise the format
 *  - headers_line_pattern:    regex tested against following lines to locate the column-headers line
 *  - column_separator:        string each data row is split on
 *  - expected_tokens_per_row: number of columns a well-formed data row splits into
 *  - row_ingester:            maps a row's tokens to [rsId, genotype pair, errors]
 */
const GENOME_FILE_STRATEGIES = [
    {
        name: "23andMe",
        // TODO DPB merge and make first_line_pattern a disjunction
        first_line_pattern: /^# This data file generated by 23andMe/,
        headers_line_pattern: /^# rsid\t/,
        column_separator: "\t",
        expected_tokens_per_row: 4,
        row_ingester: ([rsId, , , genotype]) => [rsId, genotype, /*errors*/ undefined],
    },
    {
        name: "Strategene or Ixxx (Diagnomics)",
        first_line_pattern: /^#This file was generated by Diagnomics/,
        headers_line_pattern: /^rsID\t/,
        column_separator: "\t",
        expected_tokens_per_row: 4,
        row_ingester: ([rsId, , , genotype]) => [rsId, genotype, /*errors*/ undefined],
    },
    {
        name: "Ancestry",
        first_line_pattern: /^#AncestryDNA raw data download/,
        headers_line_pattern: /rsid\tchromosome\tposition\tallele1\tallele2/,
        column_separator: "\t",
        expected_tokens_per_row: 5,
        // The two alleles arrive in separate columns and are joined into one pair.
        row_ingester: ([rsId, , , allele1, allele2]) => [rsId, allele1 + allele2, /*errors*/ undefined],
    },
    {
        name: "SelfDecode",
        // TODO DPB merge and make first_line_pattern a disjunction
        first_line_pattern: /^# Generated by SelfDecode/,
        headers_line_pattern: /rsid\tchromosome\tposition\tgenotype/,
        column_separator: "\t",
        expected_tokens_per_row: 4,
        // NOTE: some rsIds are prefixed with "gsa-"
        row_ingester: ([rsId, , , genotype]) => [stripGsaPrefixSelfDecode(rsId), genotype, /*errors*/ undefined],
    },
    {
        name: "PLINK",
        first_line_pattern: /^# This data file generated by PLINK/,
        headers_line_pattern: /rsid\tchromosome\tposition\tgenotype/,
        column_separator: "\t",
        expected_tokens_per_row: 4,
        // NOTE: some rsIds are prefixed with "gsa-"
        row_ingester: ([rsId, , , genotype]) => [stripGsaPrefixSelfDecode(rsId), genotype, /*errors*/ undefined],
    },
];

/**
 * Parses the text of a raw genome data export into a lookup from rsId to genotype.
 *
 * The (trimmed) first line selects an entry of GENOME_FILE_STRATEGIES through its
 * first_line_pattern.  That strategy's headers_line_pattern is then searched for within
 * the first MAX_LINES_TO_SEARCH_FOR_HEADERS lines of the file, and every line after the
 * headers line is treated as a data row.
 *
 * Row handling:
 *  - rows whose column count differs from the strategy's expected_tokens_per_row are
 *    skipped and summarised in one warning (an empty final line is silently ignored);
 *  - rsIds are trimmed and lower-cased, genotypes are trimmed and upper-cased;
 *  - only genotypes that are exactly two of A/C/G/T are stored; anything else is tallied
 *    and logged to the console, but not reported as a warning;
 *  - a row whose rsId is already stored is skipped and summarised in one warning.
 *
 * @param {string} genomeDataFileString - Entire contents of the data file.
 * @param {boolean} [traceToConsole=TRACE_INGEST] - When truthy, logs a short summary of the parsed data.
 * @returns {Array} `[genomeDict, errors, warnings]`, where genomeDict maps rsId to genotype.
 *     When the file is not recognised the result is `[undefined, [errorMessage], []]`.
 */
function ingestGenomeData(genomeDataFileString, traceToConsole = TRACE_INGEST) {
    const commentLines = [];
    const errors = [];
    const warnings = [];

    // TODO DPB maybe 635,000 rows, so reconsider cost of storing the split vs. parse-as-you-go (computers are more than they were)
    // For that, could split while reading or even parse while reading or even use the data and not keep around at all while reading
    const tsvRows = genomeDataFileString.split("\n");
    const tsvRowsSize = tsvRows.length;

    if (tsvRowsSize < 2) {
        return [undefined, ["Not a recognized genome data file - only zero or one lines of data ( " + INGEST_VERSION_NO + ")"], []];
    }

    // The first line identifies which file format (strategy) applies.
    let nextLineNumber = 0;
    const firstLine = tsvRows[nextLineNumber++].trim();
    commentLines.push(firstLine);

    const strategy = GENOME_FILE_STRATEGIES.find((tryMe) => tryMe.first_line_pattern.test(firstLine));
    if (!strategy) {
        const error = "Not a recognized genome data file - failed to identify first line in file. (Line '" + firstLine + "'; " + INGEST_VERSION_NO + ")";
        console.log(error);
        return [undefined, [error], []];
    }

    // Skip forward to the strategy's column-headers line, collecting everything before it
    // as comment lines.  The search is bounded both by the configured limit and by the
    // actual number of lines, so a short file cannot be read past its end.
    const headerSearchEnd = Math.min(tsvRowsSize, MAX_LINES_TO_SEARCH_FOR_HEADERS);
    let headerLine = undefined;
    while (headerLine === undefined && nextLineNumber < headerSearchEnd) {
        const headerLineCandidate = tsvRows[nextLineNumber++].trim();
        if (strategy.headers_line_pattern.test(headerLineCandidate)) {
            headerLine = headerLineCandidate;
        } else {
            commentLines.push(headerLineCandidate);
        }
    }
    if (headerLine === undefined) {
        const error = "Not a recognized genome data file - failed to find column headers line within the first " + MAX_LINES_TO_SEARCH_FOR_HEADERS + " lines. (First line " + firstLine + "; " + INGEST_VERSION_NO + ")";
        console.log(commentLines);
        return [undefined, [error], []];
    }
    // TODO DPB if file started with a bunch of lines starting with hash mark, then take the first line with rsNNNN as data

    const validGenotypePattern = /^[ACGT][ACGT]$/;
    const genomeDict = {};
    let wrongNumberOfTokensRowsCount = 0;
    let firstWrongNumberOfTokensRow = -1;
    const duplicateRsIds = [];
    const ignoredGenomes = {}; // a string keying a count

    // TODO DPB handle last row (or empty or starts-with-hash rows)

    while (nextLineNumber < tsvRowsSize) {
        const dataLine = tsvRows[nextLineNumber++];
        const tokens = dataLine.split(strategy.column_separator);

        if (tokens.length !== strategy.expected_tokens_per_row) {
            // An empty final line (file ended with a newline) is expected and not reported.
            const isTrailingEmptyLine = nextLineNumber === tsvRowsSize && tokens.length === 1 && tokens[0] === "";
            if (!isTrailingEmptyLine) {
                console.log("Line with unusual tokens count...");
                console.log(tokens);
                if (wrongNumberOfTokensRowsCount === 0) {
                    firstWrongNumberOfTokensRow = nextLineNumber - 1;
                }
                wrongNumberOfTokensRowsCount++;
            }
            continue;
        }

        // The third tuple element (per-row errors) is undefined for every current strategy
        // and is not consumed here; it is deliberately not bound so that it cannot shadow
        // the function-level `errors` array.
        const [rsIdRaw, genotype] = strategy.row_ingester(tokens);
        const rsId = rsIdRaw.trim().toLowerCase(); // TODO DPB lower case might not be correct action for non rsNNN rows

        // Own-property test, so ids that collide with inherited members (e.g. "constructor")
        // are not mistaken for duplicates.
        if (Object.prototype.hasOwnProperty.call(genomeDict, rsId)) {
            duplicateRsIds.push(rsId);
            continue;
        }

        const rsIdGenotype = genotype.trim().toUpperCase();
        if (validGenotypePattern.test(rsIdGenotype)) {
            genomeDict[rsId] = rsIdGenotype;
        } else {
            // record other than [ACGT][ACGT] into ignoredGenomes
            ignoredGenomes[rsIdGenotype] = (ignoredGenomes[rsIdGenotype] || 0) + 1;
        }
    }

    if (wrongNumberOfTokensRowsCount > 0) {
        const warn = "Data file had " + wrongNumberOfTokensRowsCount.toString() + " wrong-number-of-tabs rows.  Ignoring that row and proceeding anyway (first such line " + firstWrongNumberOfTokensRow + "; " + INGEST_VERSION_NO + ".)";
        warnings.push(warn);
    }
    if (duplicateRsIds.length > 0) {
        const warn = "Data file had " + duplicateRsIds.length.toString() + " duplicate-rsId rows. Ignoring those rows and proceeding anyway " + INGEST_VERSION_NO;
        console.log(warn);
        console.log(duplicateRsIds);
        warnings.push(warn);
    }
    // Only report when something was actually ignored (an empty object is still truthy).
    if (Object.keys(ignoredGenomes).length > 0) {
        const warn = "Data file has SNP encodings that are not pairs of A, C, G, T...";
        console.log(warn);
        console.log(ignoredGenomes);
        // This is common and user need not know.  warnings.push(warn)
    }
    if (traceToConsole) {
        const keys = Object.keys(genomeDict);
        console.log("Count of genes in data: " + keys.length);
        console.log(keys[0]);
        console.log(genomeDict[keys[0]]);
        const oneRsId = "rs7775228";
        console.log(oneRsId);
        console.log(genomeDict[oneRsId]);
    }
    return [genomeDict, errors, warnings];
}

export default ingestGenomeData;