# Copyright (c) 2014,
# Mathias Kuhring, KuhringM@rki.de, Robert Koch Institute, Germany, 
# All rights reserved. For details, please note the license.txt.

# Function for parameter parsing, file checks and preparation, 
# package loading and error handling


loadPackages <- function(names, quietly=TRUE){
  # Tries to require R packages/libraries and stop executions if failing
  #
  # Args:
  #   names: vector with package names
  #
  # Return:
  #   Nothing, but will stop execution if packages are missing
  
  check <- vector(mode="logical", length=length(names))
  
  for (i in 1:length(names)){
    suppressMessages(check[i] <- require(names[i], 
                                         quietly=quietly, 
                                         character.only=TRUE))
  }
  
  if (!all(check)){
    complainAndStop("missing R-package(s)", names[!check])
  }
}


# surankco-feature argument parser
parseSurankcoFeature <- function(args=commandArgs(trailingOnly = TRUE)){
  # Parses parameters for the surankco feature module
  # 
  # Args:
  #   args: commandline parameters, default = commandArgs(trailingOnly = TRUE) 
  #
  # Returns:
  #   List of parameter values as provided by parse_args (optparse package)
  
  parser.options <- list(
    # Input parameter
    make_option(opt_str=c("-a", "--assemblies"),
                action="store",
                help="Indicate a list of assembly files (comma separated), correct 
                suffixes are mandatory (e.g. \".ace\")"),
    
    make_option(opt_str=c("-d", "--directory"),
                action="store",
                help="Indicate a directory containing assembly files with indicated 
                format (default: ace, see parameter -f)"),
    
    make_option(opt_str=c("-f", "--assembly.format"),
                action="store",
                default="ace",
                help="Indicate assembly/contig format (resp. suffix), 
                either \"ace\" (default) or \"contigs.fasta\""),
    
    make_option(opt_str=c("-r", "--read.quality.format"),
                action="store",
                default="no value",
                help="Indicate the read quality format: qual, qua or fastq for ACE 
                (default=\"qual\") resp. sam or bam for contigs.fasta (default=\"sam\")"),
    
    make_option(opt_str=c("-q", "--fastq.version"),
                action="store",
                default="illumina18",
                help="Indicate the fastq version: auto, sanger, solexa, illumina13, 
                illumina15, illumina18 (default). Only needed for ACE assemblies."),
    
    # Read name regex
    make_option(opt_str=c("-s", "--split.regex"),
                action="store",
                #default=".[0-9]*-",
                default="nosplit",
                type="character",
                help="Indicate a regular expression to cutoff read names (e.g. if 
                modified by the assembler). Only needed for ACE assemblies. 
                Note, if a backslash \"\\\" is needed use \"\\\\\\\\\"!"),
    
    #     make_option(opt_str=c("-n", "--name.regex"),
    #                 action="store",
    #           help="Indicate a regular expression to match read names
    #                 (e.g. if modified by the assembler)"),
    
    # Performance parameter
    make_option(opt_str=c("-t", "--threads"),
                action="store",
                default=1,
                type="integer",
                help="Indicate a number of cores or threads to use. Might speed up 
                some parallelized operations (default: 1)"),
    
    make_option(opt_str=c("-m", "--memory"),
                action="store",
                default=32,
                type="double",
                help="Indicate the maximum memory usage (in Gb) of Javas virtual 
                machine (default: 32). Try to increase if big data sets report 
                heap space problems."),
    
    # Feature parameter
    make_option(opt_str=c("-k", "--kmer.features"),
                action="store_true",
                default=FALSE,
                help="Indicates whether k-mer features should be computed 
                (experimental, very long runtime) or not (default)"),
    
    make_option(opt_str=c("-g", "--expected.genome.size"),
                action="store",
                default="0",
                type="character",
                help="Indicate a list of expected genome sizes (comma separated) or 
                one value for all assemblies. Default is 0, which will estimate 
                the genome sizes as sum of contig lengths."),
    
    make_option(opt_str=c("-c", "--contig.size.filter"),
                action="store",
                default="0",
                type="integer",
                help="Indicate a minimum contig size. Default: 0")
  )
  
  myparser <- OptionParser(usage="usage: surankco-feature [options]",
                           option_list=parser.options,
                           prog = NULL, description = "", epilogue = "")
  
  parameters <- parse_args(object = myparser,
                           args = args,
                           print_help_and_exit = TRUE,
                           positional_arguments = FALSE)
  
  
  # check obligatory parameter
  checkObligatoryParameters(parameters, need.xor=c("assemblies", "directory"))
  
  # check assembly.format
  if (!(parameters$assembly.format %in% c("ace", "contigs.fasta"))){
    complainAndStop(paste("invalid \"assembly.format\" (resp. suffix)", 
                          "(expecting ace or contigs.fasta)"),
                    parameters$assembly.format)
  }
  
  # check read.quality.format
  if (parameters$assembly.format == "ace"){
    if (parameters$read.quality.format == "no value") parameters$read.quality.format <- "qual"
    else if (!(parameters$read.quality.format %in% c("qual", "qua", "fastq"))){
      complainAndStop(paste("invalid \"read.quality.format\"", 
                            "(expecting qual, qua, fastq)"),
                      parameters$read.quality.format)
    }
  }
  if (parameters$assembly.format == "contigs.fasta"){
    if (parameters$read.quality.format == "no value") parameters$read.quality.format <- "sam"
    else if (!(parameters$read.quality.format %in% c("sam", "bam"))){
      complainAndStop(paste("invalid \"read.quality.format\"", 
                            "(expecting sam or bam)"),
                      parameters$read.quality.format)
    }
  }
  
  # check files
  parameters$files <- prepareFiles(parameters, "assemblies",
                                   parameters$assembly.format,
                                   parameters$read.quality.format)
  
  # parameter checks
  # fastq.version
  if (!(parameters$fastq.version %in% c("auto", "sanger", "solexa",
                                        "illumina13", "illumina15", 
                                        "illumina18"))){
    complainAndStop(paste("invalid \"fastq.version\"", 
                          "(expecting auto, sanger, solexa, illumina13, ", 
                          "illumina15 or illumina18)"),
                    parameters$fastq.version)
  }
  
  # threads
  if (is.na(parameters$threads) || parameters$threads < 1){
    complainAndStop(paste("invalid thread number", 
                          "(expecting a positiv integer)"),
                    parameters$threads)
  }
  parameters$threads <- min(parameters$threads, nrow(parameters$files))
  
  # memory
  if (is.na(parameters$memory) || parameters$memory <= 0){
    complainAndStop(paste("invalid memory value ", 
                          "(expecting a positiv number)"),
                    parameters$memory)
  }
  
  # expected.genome.size
  splits <- as.numeric(strsplit(parameters$expected.genome.size, ",")[[1]])
  if (any(is.na(splits)) || any(floor(splits)!=splits) || any(splits < 0)){
    complainAndStop(paste("invalid values of \"expected.genome.size\"", 
                          "(expecting positive integers only)"), splits)
  }
  if (length(parameters$expected.genome.size) > 1){
    if (nrow(parameters$files) != length(parameters$expected.genome.size)){
      complainAndStop(paste("number of expected.genome.size values and assembly", 
                            "files not matching"),
                      paste(length(parameters$expected.genome.size), "vs",
                            nrow(parameters$files)))
    }
  }
  parameters$expected.genome.size <- splits
  
  # contig.size.filter
  if (is.na(parameters$contig.size.filter) || parameters$contig.size.filter < 0){
    complainAndStop(paste("invalid minimum contig size", 
                          "(expecting a positiv integer >= 0)"),
                    parameters$contig.size.filter)
  }
  
  return(parameters)
}


# surankco-score argument parser
parseSurankcoScore <- function(args = commandArgs(trailingOnly = TRUE)){
  # Parses parameters for the surankco score module
  # 
  # Args:
  #   args: commandline parameters, default = commandArgs(trailingOnly = TRUE) 
  #
  # Returns:
  #   List of parameter values as provided by parse_args (optparse package)
  
  parser.options <- list(
    # Input parameter
    make_option(opt_str=c("-a", "--assemblies"),
                action="store",
                help="Indicate a list of assembly files (comma separated), correct 
                suffixes are mandatory (default \"*.contigs.fasta\")"),
    
    make_option(opt_str=c("-d", "--directory"),
                action="store",
                help="Indicate a directory containing assembly files with indicated 
                format (e.g. \"*.contigs.fasta\")"),
    
    make_option(opt_str=c("-f", "--assembly.suffix"),
                action="store",
                default="contigs.fasta",
                help="Indicate assembly format/suffix, default=\"contigs.fasta\""),
    
    make_option(opt_str=c("-r", "--reference.suffix"),
                action="store",
                default="ref.fasta",
                help="Indicate the reference format/suffix, default=\"ref.fasta\""),
    
    make_option(opt_str=c("-p", "--pdf.histograms"),
                action="store",
                default="surankco_score_histograms.pdf",
                help="Indicate a name for the score histogram pdf  
                (default = \"surankco_score_histograms.pdf\")"),
    
    make_option(opt_str=c("-m", "--memory"),
                action="store",
                default=32,
                type="integer",
                help="Indicate the maximum memory usage (in Gb) of Javas virtual 
                machine (default: 32). Try to increase if big data sets report 
                heap space problems.")
  )
  
  myparser <- OptionParser(usage="usage: surankco-score [options]",
                           option_list=parser.options,
                           prog = NULL, description = "", epilogue = "")
  
  parameters <- parse_args(object = myparser,
                           args = args,
                           print_help_and_exit = TRUE,
                           positional_arguments = FALSE)
  
  
  # check obligatory parameter
  checkObligatoryParameters(parameters, need.xor=c("assemblies", "directory"))
  
  # check files
  parameters$files <- prepareFiles(parameters, "assemblies",
                                   parameters$assembly.suffix,
                                   parameters$reference.suffix)
  
  # parameter checks
  # pdf.histograms
  if (!file.exists(dirname(parameters$pdf.histograms))){
    complainAndStop("can not find path for \"pdf.histograms\"",
                    parameters$pdf.histograms)
  }
  
  # memory
  if (is.na(parameters$memory) || parameters$memory <= 0){
    complainAndStop(paste("invalid memory value ", 
                          "(expecting a positiv number)"),
                    parameters$memory)
  }
  
  return(parameters)
}


# surankco-training argument parser
parseSurankcoTraining <- function(args = commandArgs(trailingOnly = TRUE)){
  # Parses parameters for the surankco training module
  # 
  # Args:
  #   args: commandline parameters, default = commandArgs(trailingOnly = TRUE) 
  #
  # Returns:
  #   List of parameter values as provided by parse_args (optparse package)
  
  parser.options <- list(
    # Input parameter
    make_option(opt_str=c("-f", "--features"),
                action="store",
                help="Indicate a list of surankco feature files (comma separated), 
                correct suffixes are mandatory (\"*.features.txt\")"),
    
    make_option(opt_str=c("-d", "--directory"),
                action="store",
                help="Indicate a directory containing surankco feature files 
                (\"*.features.txt\")"),
    
    make_option(opt_str=c("-o", "--output.filename"),
                action="store",
                default="surankco_rfs.RData",
                help="Indicate a name for the export of the random forest classifiers 
                (default = \"surankco_rfs.RData\")"),
    
    make_option(opt_str=c("-e", "--exponential.quantile"),
                action="store",
                default=0.25,
                help="Indicate an exponential distribution quantile to divide scores 
                (default: 0.25)"),
    
    make_option(opt_str=c("-m", "--manual.thresholds"),
                action="store",
                help="Indicate manual thresholds instead of quantiles, one per score 
                each (comma separated) in the same order as in the score file")
  )
  
  myparser <- OptionParser(usage="usage: surankco-training [options]",
                           option_list=parser.options,
                           prog = NULL, description = "", epilogue = "")
  
  parameters <- parse_args(object = myparser,
                           args = args,
                           print_help_and_exit = TRUE,
                           positional_arguments = FALSE)
  
  
  # check (obligatory) parameters
  checkObligatoryParameters(parameters, need.xor=c("features", "directory"))
  
  # check files
  parameters$features.suffix <- "features.txt"
  parameters$scores.suffix <- "scores.txt"
  parameters$files <- prepareFiles(parameters, "features",
                                   parameters$features.suffix,
                                   parameters$scores.suffix)
  
  # parameter checks
  # output.filename
  if (!file.exists(dirname(parameters$output.filename))){
    complainAndStop("can not find path for \"output.filename\"",
                    parameters$output.filename)
  }
  
  # exponential.quantile
  if (is.na(parameters$exponential.quantile) ||
        parameters$exponential.quantile <= 0 || 
        parameters$exponential.quantile >= 1){
    complainAndStop("\"exponential.quantile\" out of range (0,1)",
                    parameters$exponential.quantile)
  }
  
  # manual.thresholds
  if ("manual.thresholds" %in% names(parameters)){
    splits <- as.numeric(strsplit(parameters$manual.thresholds, ",")[[1]])
    in.range <- 0 <= splits[-c(5,7)] & splits[-c(5,7)] <= 1 
    in.range <- append(in.range, 0 <= splits[c(5,7)] & splits[c(5,7)] <= 100)
    if (length(splits) != 8){
      complainAndStop("wrong number of \"manual.thresholds\" (expecting 8)", splits)
    }
    if (is.na(splits) || !all(in.range)){
      complainAndStop(paste0("out of range \"manual.thresholds\" (expecting ", 
                             "[0,1] resp. [0,100] for threshold 5 and 7)"), splits)
    }
    parameters$manual.thresholds <- splits
  }
  
  return(parameters)
}


# surankco-prediction argument parser
parseSurankcoPrediction <- function(args = commandArgs(trailingOnly = TRUE)){
  # Parses parameters for the surankco prediction module
  # 
  # Args:
  #   args: commandline parameters, default = commandArgs(trailingOnly = TRUE) 
  #
  # Returns:
  #   List of parameter values as provided by parse_args (optparse package)
  
  parser.options <- list(
    # Input parameter
    make_option(opt_str=c("-f", "--features"),
                action="store",
                help="Indicate one surankco feature file, a correct suffixe is 
                mandatory (\"*.features.txt\")"),
    
    #     make_option(opt_str=c("-d", "--directory"),
    #                 action="store",
    #                 help="Indicate a directory containing surankco feature files 
    #                 (\"*.features.txt\")"),
    
    make_option(opt_str=c("-r", "--random.forests"),
                action="store",
                help="Indicate a surankco random forests file (\"*.RData\")"),
    
    make_option(opt_str=c("-o", "--output.filename"),
                action="store",
                default="surankco_results.txt",
                help="Indicate a name for the final results file")
  )
  
  myparser <- OptionParser(usage="usage: surankco-prediction [options]",
                           option_list=parser.options,
                           prog = NULL, description = "", epilogue = "")
  
  parameters <- parse_args(object = myparser,
                           args = args,
                           print_help_and_exit = TRUE,
                           positional_arguments = FALSE)
  
  
  # check obligatory parameter
  checkObligatoryParameters(parameters, need.all=c("features", "random.forests"))
  
  # check files
  parameters$features.suffix <- "features.txt"
  parameters$rf.suffix <- "RData"
  parameters$files.features <- prepareFiles(parameters, "features", parameters$features.suffix)
  parameters$files.rf <- prepareFiles(parameters, "random.forests", parameters$rf.suffix)
  parameters$files.features <- parameters$files.features$features.txt
  parameters$files.rf <- parameters$files.rf$RData
  
  # parameter checks
  # output.filename
  if (!file.exists(dirname(parameters$output.filename))){
    complainAndStop("can not find path for \"output.filename\"",
                    parameters$output.filename)
  }
  
  return(parameters)
}


checkObligatoryParameters <- function(values, need.all=NULL, need.xor=NULL){
  # Checks whether all obligatory parameters were indicated
  #
  # Args:
  #   values:   list of parameter values as provided 
  #             by parse_args (optparse package)
  #   need.all: vector of obligatory parameters
  #   need.xor: vector of parameters where only one is needed (exclusive)
  #
  # Returns:
  #   Nothing, but will stop execution if obligatory parameters are missing
  
  if (!is.null(need.all)){
    if (!all(test <- need.all %in% names(values))){
      complainAndStop("missing parameter(s)", need.all[!test])
    }
  }
  
  if (!is.null(need.xor)){
    if (sum(test <- need.xor %in% names(values)) > 1){
      complainAndStop("conflicting parameters", need.xor[test])
    }
    if (sum(test <- need.xor %in% names(values)) < 1){
      complainAndStop("missing parameters (either)", need.xor)
    }
  }
}


prepareFiles <- function(parameters, file.parameter, 
                         first.suffix, additional.suffixes=NULL){
  # prepares a list of files from either a directory or a filelist
  
  if ("directory" %in% names(parameters)){
    filelist <- paste(sep="", parameters$directory, "/", 
                      list.files(path=parameters$directory,
                                 pattern=paste(first.suffix, "$", sep="")))
  }
  else{
    filelist <- strsplit(parameters[[file.parameter]], split=",", fixed=TRUE)[[1]]
  }
  
  files <- checkFiles(filelist, first.suffix, additional.suffixes)
  
  return(files)
}


checkFiles <- function(filelist, first.suffix, additional.suffixes=NULL){
  # Checks if files have the expected suffix and if files exists,
  # includes files with same prefix but different suffix
  # 
  # Args:
  #   filelist:             
  #   first.suffix:         
  #   additional.suffixes:  
  #
  # Returns:
  #   A data.frame containing filename columns per suffix or
  #   will stop execution if files are missing
  
  suffix <- paste(".", first.suffix, "$", sep="")
  
  # check first.suffix 
  missing.suffix <- !grepl(pattern=suffix, x=filelist)
  if (any(missing.suffix)){
    complainAndStop(paste("missing suffix -", first.suffix, 
                          "- for file(s)", sep=""),
                    filelist[missing.suffix], 
                    item.start=":\n\t",
                    item.sep="\n\t")
  }
  
  prefix <- sub(pattern=suffix, replacement="", x=filelist)
  files <- data.frame(filelist, stringsAsFactors=FALSE)
  colnames(files) <- first.suffix
  rownames(files) <- prefix
  
  # build filenames from additional.suffixes
  if (!is.null(additional.suffixes)){
    for (add.suf in additional.suffixes){
      files[add.suf] <- paste(prefix, add.suf, sep=".")
    }
  }
  
  # check file existence
  missing.files <- !file.exists(unlist(files))
  if (any(missing.files)){
    complainAndStop("missing file(s)",
                    unlist(files)[missing.files],
                    item.start=":\n\t",
                    item.sep="\n\t")
  }
  
  return(files)
}


complainAndStop <- function(message, items=NULL, 
                            item.start=": ", item.sep=", ", item.end=""){
  # Prints an error message and stops the programm using stop()
  #
  # Args:
  #   message:    error message to print
  #   items:      list of items to add to the message (e.g. missing files)
  #   item.start: separator between message and items (e.g. ": ")
  #   item.sep:   separator for the item list (e.g. ", ", "\n\t")
  
  if (is.null(items)){
    stop(message, call.=FALSE)
  }
  else{
    items <- paste(items, collapse=item.sep)
    stop(paste(message, items, sep=item.start), call.=FALSE)
  } 
}