Reducing a sequence to SNPs

A script for a simple task.

A largely self-explanatory script. This will "shrink" an alignment, deleting all sites that don't contain a polymorphism in some member sequence. A little bit of script candy as well, this takes any number of files and saves the results in a new file named according to a definable schema:

#!/usr/bin/env ruby
# Reduce a sequence to solely the SNP sites.

### IMPORTS

require 'test/unit/assertions'
require 'optparse'
require 'pp'

require 'ostruct'
require 'date'
require 'time'
require 'bio'
include Test::Unit::Assertions

### CONSTANTS & DEFINES

### IMPLEMENTATION

# complete bit of frivolity which I use to define formats for strings
def interpolate (str, sub_hash)
        return str.gsub (/{([^}]+)}/) { |m|
                sub_hash[$1]
        }
end

### MAIN

# Parse commandline arguments.
def parse_clargs (arg_arr)
        clopts = {
                :save => "{root}-snps.fasta",
                :overwrite => false,
        }
        pargs = []

        OptionParser.new { |opts|
                opts.program_name = __FILE__
                opts.banner = "Reduce a sequence to solely the SNP sites"
                opts.separator ("")
                opts.separator ("Usage: #{opts.program_name} [options] ALN1 ...]")
                opts.on ('-h', '--help', 'Display this screen') {
                        puts opts
                        exit
                }
                opts.on ('', '--save STR',
                        "Name output files according this template") { |v|
                        clopts[:save] = v
                }
                opts.on ('-o', '--overwrite', "Overwrite pre-existing files") {
                        clopts[:overwrite] = true
                }
                begin
                        opts.parse! (arg_arr)
                        pargs = arg_arr
                        assert (1 <= pargs.length, "need files to work on")
                rescue Exception => e
                        error_msg = e.to_str.split(" ")
                        print "Error: #{error_msg[0]}"
                        print opts
                        exit 1
                end
        }

        return clopts, pargs
end

# Main script functionality.
def main()
        clopts, aln_files = parse_clargs (ARGV)

        aln_files.each { |f|
                # slurp ...
                seqs = [] Bio::FlatFile.open(f) { |rdr|
                        seqs = rdr.collect { |rec|
                                rec.to_seq()
                        }
                }
                # calculate diffs diff_hash = {}
                seqs.each { |s|
                        diff_hash[s.entry_id] = ""
                }
        seq_len = seqs[0].length()
        (0...seq_len).each { |i|
                is_diff = false
                seqs.each { |s1|
                        seqs.each { |s2|
                                if s1 != s2
                                        if s1[i,1] != s2[i,1]
                                                is_diff = true
                                                break
                                        end
                                end
                        }
                        if is_diff == true
                                break
                        end
                }
                if is_diff == true
                        seqs.each { |s|
                                diff_hash[s.entry_id] += s[i,1]
                        }
                end
        }

                # write output

                # make filename
                # this is where my overly ornate templating comes in
                ext = File.extname(f)
                subs = {
                        "ext" => ext[1, ext.length],
                        "base" => File.basename(f),
                        "root" => File.basename(f, ext),
                        "date" => Date.today.to_s(),
                        "time" => Time.now.strftime(fmt='%T'),
                        "datetime" => DateTime.now.strftime (fmt='%FT%T'),
                }
                out_name = interpolate(clopts[:save], subs)

                # do the writing
                puts "Saving results to '#{out_name}' ..."
                if File.exists? (out_name)
                        assert (clopts[:overwrite], "Can't overwrite existing file '#{out_name}'")
                end
                File.open(out_name, 'w') { |wrtr|
                        diff_hash.each_pair { |k,v|
                                wrtr << ">#{k}"
                                wrtr << "#{v}"
                        }
                }
                puts "Saved '#{out_name}'."
        }
        puts "== Finished."
end

if $0 == __FILE__
        main()
end

### END