#!/usr/bin/env ruby
#
# bio-vcf parser and transformer
# Author:: Pjotr Prins
# License:: MIT
#
# Copyright (C) 2014-2020 Pjotr Prins <pjotr.prins@thebird.nl>

USAGE = "Vcf parser"

gempath = File.dirname(File.dirname(__FILE__))
$: << File.join(gempath,'lib')

VERSION_FILENAME=File.join(gempath,'VERSION')
version = File.new(VERSION_FILENAME).read.chomp

require 'bio-vcf'
require 'bio-vcf/pcows'
require 'optparse'
require 'timeout'
require 'fileutils'

# Uncomment when using the bio-logger
# require 'bio-logger'
# log = Bio::Log::LoggerPlus.new 'vcf'
# log.outputters = Bio::Log::Outputter.stderr
# Bio::Log::CLI.logger('stderr')
# Bio::Log::CLI.trace('info')

options = { show_help: false, source: 'https://github.com/pjotrp/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
opts = OptionParser.new do |o|
  o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g.  #{File.basename($0)} < test/data/input/somaticsniper.vcf"

  o.on('-i','--ignore-missing', 'Ignore missing data') do
    options[:ignore_missing] = true
  end
  o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
    options[:filter] = cmd
  end

  o.on('--sfilter cmd',String, 'Evaluate filter on each sample') do |cmd|
    options[:sfilter] = cmd
  end
  o.on("--sfilter-samples list", Array, "Filter on selected samples (e.g., 0,1") do |l|
    options[:sfilter_samples] = l
  end

  o.on('--ifilter cmd','--if cmd',String, 'Include filter') do |cmd|
    options[:ifilter] = cmd
  end
  o.on("--ifilter-samples list", Array, "Include set - implicitely defines exclude set") do |l|
    options[:ifilter_samples] = l
  end

  o.on('--efilter cmd','--ef cmd',String, 'Exclude filter') do |cmd|
    options[:efilter] = cmd
  end
  o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
    options[:efilter_samples] = l
  end
  o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
    options[:add_filter] = name
  end

  o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
    options[:bed] = bed
  end

  o.on('-e cmd', '--eval cmd',String, 'Evaluate command on each record') do |cmd|
    options[:eval] = cmd
  end
  o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
    options[:eval_once] = true
    options[:eval] = cmd
    # options[:num_threads] = 1
    # options[:thread_lines] = 1
    options[:skip_header] = true
  end
  o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
    options[:seval] = cmd
    options[:skip_header] = true
  end
  o.on("--rewrite eval", "Rewrite INFO") do |s|
    options[:rewrite] = s
  end
  o.on("--samples list", Array, "Output selected samples") do |l|
    options[:samples] = l
  end
  o.on("--rdf", "Generate Turtle RDF (also check out --template!)") do |b|
    require 'bio-vcf/vcfrdf'
    options[:rdf] = true
    options[:skip_header] = true
  end
  o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
    options[:num_threads] = i
  end
  o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
    options[:thread_lines] = i
  end
  o.on_tail("--id name", String, "Identifier") do |s|
    options[:id] = s
  end
  o.on_tail("--tags list", String, "Add tags") do |s|
    options[:tags] = s
  end

  o.on("--skip-header", "Do not output VCF header info") do
    options[:skip_header] = true
  end

  o.on("--set-header list", Array, "Set a special tab delimited output header (#samples expands to sample names)") do |list|
    options[:set_header] = list
    options[:skip_header] = true
  end

  o.on("-t erb","--template erb",String, "Use ERB template for output") do |s|
    require 'bio-vcf/vcfrdf'
    require 'erb'
    options[:template] = s
    options[:skip_header] = true
  end

  o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
    options[:tag] = true
  end

  o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
    options[:timeout] = i
  end

  # Uncomment the following when using the bio-logger
  # o.separator ""
  # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
  #   Bio::Log::CLI.logger(name)
  # end
  #
  # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
  #   Bio::Log::CLI.trace(s)
  # end
  #
  o.on("--names", "Output sample names") do |q|
    options[:quiet] = true
    options[:num_threads] = nil
    options[:eval_once] = true
    options[:eval] = 'header.samples.join("\t")'
    # options[:num_threads] = 1
    # options[:thread_lines] = 1
    options[:skip_header] = true
  end
  o.on("--statistics", "Output statistics") do |q|
    options[:statistics] = true
    options[:num_threads] = nil
  end
  o.on("-q", "--quiet", "Run quietly") do |q|
    # Bio::Log::CLI.trace('error')
    options[:quiet] = true
  end

  o.on("-v", "--verbose", "Run verbosely") do |v|
    options[:verbose] = true
  end

  o.on("--debug", "Show debug messages and keep intermediate output") do |v|
    # Bio::Log::CLI.trace('debug')
    options[:debug] = true
  end

  o.separator ""
  o.on_tail('-h', '--help', 'display this help and exit') do
    options[:show_help] = true
  end
end

opts.parse!(ARGV)

BIOVCF_VERSION=version
BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2020\n"
$stderr.print BIOVCF_BANNER if !options[:quiet]

if options[:show_help]
  print opts
  print USAGE
  exit 1
end

if RUBY_VERSION =~ /^1/
  $stderr.print "WARNING: bio-vcf runs on Ruby 2.x only\n"
end

$stderr.print "Options: ",options,"\n" if !options[:quiet]

if options[:template]
  include BioVcf::RDF
  require 'bio-vcf/template'
  fn = options[:template]
  raise "No template #{fn}!" if not File.exist?(fn)
  # template = ERB.new(File.read(fn))
  template = Bio::Template.new(fn)
end

stats = nil
if options[:statistics]
  options[:num_threads] = nil
  stats = BioVcf::VcfStatistics.new
end

# Check for option combinations
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
# raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
# raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]

if options[:samples]
  samples = options[:samples].map { |s| s.to_i }
end

include BioVcf

# Parse the header section of a VCF file (chomping STDIN)
def parse_header line, samples, options
  header = VcfHeader.new(options[:debug])
  header.add(line)
  print line if not options[:skip_header]
  STDIN.each_line do | headerline |
    if headerline !~ /^#/
      # If no records in VCF, we never get here
      line = headerline
      break # end of header
    end
    header.add(headerline)
    if not options[:skip_header]
      if headerline =~ /^#CHR/
        # The header before actual data contains the sample names, first inject the BioVcf meta information
        print header.tag(options),"\n" if options[:tag] and not options[:skip_header]
        # Then the additional filter(s)
        # ##FILTER=<ID=LowQual,Description="Low quality">
        add_filter = options[:add_filter]
        if add_filter
          print "##FILTER=<ID=",add_filter,",Description=\"",options[:filter],"\">\n"
        end

        selected = header.column_names
        if samples
          newfields = selected[0..8]
          samples.each do |s|
            newfields << selected[s+9]
          end
          selected = newfields
        end
        print "#",selected.join("\t"),"\n"
      else
        print headerline
      end
    end
  end
  print header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
  VcfRdf::header if options[:rdf]
  if line =~ /^#/
    # We did not read a record
    line = nil
  end
  return header,line
end

# Parse a VCF line and return the (template) result as a string buffer
def parse_line line,header,options,bedfilter,samples,template,stats=nil
  fields = VcfLine.parse(line)
  rec = VcfRecord.new(fields,header)
  r = rec # alias

  filter = options[:filter]
  sfilter = options[:sfilter]
  efilter = options[:efilter]
  ifilter = options[:ifilter]
  add_filter = options[:add_filter] # contains a filter name (soft filter)
  seval = options[:seval]
  ignore_missing = options[:ignore_missing]
  quiet = options[:quiet]
  set_filter_field = nil

  if sfilter or efilter or ifilter or seval
    # check for samples
    header_samples = header.column_names[9..-1]
    raise "Empty sample list, can not execute query!" if not header_samples
  end

  # --------------------------
  # Filtering and set analysis
  if bedfilter
    bed = bedfilter.contains(rec)
    return if not bed
  end

  skip = lambda { |&m|
    matched = m.call
    if add_filter
      set_filter_field = true if matched
      false  # always continue processing with an add-filter
    else
      not matched
    end
  }

  if filter
    return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
  end

  if sfilter # sample 'or' filter
    rec.each_sample(options[:sfilter_samples]) do | sample |
      return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
    end
  end

  if ifilter # include sample filter
    found = false
    rec.each_sample(options[:ifilter_samples]) do | sample |
      if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
        found = true
        break
      end
    end
    # Skip if there are no matches
    return if skip.call {found}
  end

  if efilter # exclude sample filter
    rec.each_sample(options[:efilter_samples]) do | sample |
      return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
    end
  end

  stats.add(rec) if stats

  # -----------------------------
  # From here on decide on output

  rec.add_to_filter_field(add_filter) if set_filter_field

  if samples
    # Select certain samples for output
    newfields = fields[0..8]
    samples.each do |s|
      newfields << fields[s+9]
    end
    fields = newfields
  end
  if options[:eval] or seval
    begin
      results = nil # result string
      if options[:eval]
        res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
        results = res if res
      end
      if seval
        list = (results ? [] : [rec.chr,rec.pos])
        rec.each_sample(options[:sfilter_samples]) { | sample |
          list << sample.eval(seval,ignore_missing_data: ignore_missing,quiet: quiet)
        }
        results = (results ? results.to_s + "\t" : "" ) + list.join("\t")
      end
    rescue => e
      $stderr.print "\nLine: ",line
      $stderr.print "ERROR evaluating --eval <#{options[:eval]}> #{e.message}\n"
      raise if options[:verbose]
      exit 1
    end
    return results.to_s+"\n" if results
  else
    if options[:rdf]
      # Output Turtle RDF
      VcfRdf::record(options[:id],rec,options[:tags])
    elsif options[:template]
      # Use ERB template
      begin
        template.body(binding)
      rescue Exception => e
        $stderr.print e,": ",fields,"\n"
        $stderr.print e.backtrace.inspect if options[:verbose]
        raise
      end
    elsif options[:rewrite]
      # Default behaviour prints VCF line, but rewrite info
      eval(options[:rewrite])
      (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
    elsif stats
      # do nothing
    else
      # Default behaviour prints VCF line
      fields.join("\t")+"\n"
    end
  end
end

CHUNK_SIZE = options[:thread_lines]

pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
                  options[:quiet],options[:debug])
header = nil
header_output_completed = false
chunk_lines = []
line_number=0

if options[:bed]
  bedfilter = BedFilter.new(options[:bed])
end

begin
  # Define linear parser function (going through one chunk)
  process = lambda { | lines |
    res = []
    lines.each do | line |
      res << parse_line(line,header,options,bedfilter,samples,template,stats)
    end
    res
  }

  # ---- Main loop
  STDIN.each_line do | line |
    line_number += 1

    # ---- Skip embedded headers down the line...
    next if header_output_completed and line =~ /^#/

    # ---- In the following section header information is handled -
    #      this only happens once.

    # ---- Parse the header lines (chomps from STDIN)
    #      and returns header info and the current line
    if line =~ /^#/
      header, line = parse_header(line,samples,options)
      if line.nil?
        # No line after header, to there are no records to process
        break
      end
    end
    # p [line_number,line]
    # ---- After the header continue processing
    if not header_output_completed
      # one-time post-header processing
      if not options[:efilter_samples] and options[:ifilter_samples]
        # Create exclude set as a complement of include set
        options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
      end
      print template.header(binding) if template
      header_output_completed = true
    end

    if options[:eval_once]
      # this happens if we only want one line evaluated - say to get
      # the number of samples
      print parse_line(line,header,options,bedfilter,samples,template,stats)
      exit 0
    end

    # ---- Lines are collected in one buffer and the lines buffer
    #      is added to the chunks list (for the threads)
    chunk_lines << line

    # ---- In the following section the VCF lines are parsed by chunks
    #      The chunks may go into different threads

    if chunk_lines.size >= CHUNK_SIZE
      # ---- process one chunk
      $stderr.print '.' if not options[:quiet]
      pcows.wait_for_worker_slot()
      pcows.submit_worker(process,chunk_lines)
      pcows.process_output()

      chunk_lines = []
    end
  end
  pcows.submit_final_worker(process,chunk_lines)
  pcows.wait_for_workers()
  pcows.process_remaining_output()

  print template.footer(binding) if template
  stats.print if stats

rescue Exception => e
  if e.message != 'exit'
    $stderr.print "ERROR: "
    $stderr.print e.message,"\n"
  end
  pcows.cleanup()
  raise if options[:verbose]
  exit 1
end
