Reducing a sequence to SNPs
A script for a simple task.
A largely self-explanatory script. This will "shrink" an alignment, deleting all sites that don't contain a polymorphism in some member sequence. A little bit of script candy as well, this takes any number of files and saves the results in a new file named according to a definable schema:
#!/usr/bin/env ruby # Reduce a sequence to solely the SNP sites. ### IMPORTS require 'test/unit/assertions' require 'optparse' require 'pp' require 'ostruct' require 'date' require 'time' require 'bio' include Test::Unit::Assertions ### CONSTANTS & DEFINES ### IMPLEMENTATION # complete bit of frivolity which I use to define formats for strings def interpolate (str, sub_hash) return str.gsub (/{([^}]+)}/) { |m| sub_hash[$1] } end ### MAIN # Parse commandline arguments. def parse_clargs (arg_arr) clopts = { :save => "{root}-snps.fasta", :overwrite => false, } pargs = [] OptionParser.new { |opts| opts.program_name = __FILE__ opts.banner = "Reduce a sequence to solely the SNP sites" opts.separator ("") opts.separator ("Usage: #{opts.program_name} [options] ALN1 ...]") opts.on ('-h', '--help', 'Display this screen') { puts opts exit } opts.on ('', '--save STR', "Name output files according this template") { |v| clopts[:save] = v } opts.on ('-o', '--overwrite', "Overwrite pre-existing files") { clopts[:overwrite] = true } begin opts.parse! (arg_arr) pargs = arg_arr assert (1 <= pargs.length, "need files to work on") rescue Exception => e error_msg = e.to_str.split(" ") print "Error: #{error_msg[0]}" print opts exit 1 end } return clopts, pargs end # Main script functionality. def main() clopts, aln_files = parse_clargs (ARGV) aln_files.each { |f| # slurp ... seqs = [] Bio::FlatFile.open(f) { |rdr| seqs = rdr.collect { |rec| rec.to_seq() } } # calculate diffs diff_hash = {} seqs.each { |s| diff_hash[s.entry_id] = "" } seq_len = seqs[0].length() (0...seq_len).each { |i| is_diff = false seqs.each { |s1| seqs.each { |s2| if s1 != s2 if s1[i,1] != s2[i,1] is_diff = true break end end } if is_diff == true break end } if is_diff == true seqs.each { |s| diff_hash[s.entry_id] += s[i,1] } end } # write output # make filename # this is where my overly ornate templating comes in ext = File.extname(f) subs = { "ext" => ext[1, ext.length], "base" => File.basename(f), "root" => File.basename(f, ext), "date" => Date.today.to_s(), "time" => Time.now.strftime(fmt='%T'), "datetime" => DateTime.now.strftime (fmt='%FT%T'), } out_name = interpolate(clopts[:save], subs) # do the writing puts "Saving results to '#{out_name}' ..." if File.exists? (out_name) assert (clopts[:overwrite], "Can't overwrite existing file '#{out_name}'") end File.open(out_name, 'w') { |wrtr| diff_hash.each_pair { |k,v| wrtr << ">#{k}" wrtr << "#{v}" } } puts "Saved '#{out_name}'." } puts "== Finished." end if $0 == __FILE__ main() end ### END