|
| 1 | +#!/usr/bin/ruby |
| 2 | +# |
| 3 | +# This program downloads the newest version of hla_nuc.fasta and then |
| 4 | +# processes it, creating three standard files for a, b, and c. Any sequence |
| 5 | +# that doesn't seem to align properly will be rejected. |
| 6 | +# The alignment algorithm I'm using is the dirtiest possible. |
| 7 | +# This program requires the bioruby library |
| 8 | + |
| 9 | + |
| 10 | +#require 'bio' |
| 11 | +require 'fileutils' |
| 12 | +require 'net/http' |
| 13 | +require 'uri' |
| 14 | +require 'date' |
| 15 | +require 'digest' |
| 16 | +require 'json' |
| 17 | + |
| 18 | +#scores the sequence according to how many characters don't match. An |
| 19 | +#alignment of 0 means a perfect match. We optimize it by assuming anything under 20 is probably |
| 20 | +#a match. |
| 21 | +def score(seq, align) |
| 22 | + maxscore = align.size |
| 23 | + maxseq = -1; |
| 24 | + |
| 25 | + 0.upto(seq.size - align.size) do |i| |
| 26 | + score = align.size |
| 27 | + 0.upto(align.size - 1) do |j| |
| 28 | + if(align[j] == seq[i + j]) |
| 29 | + score -= 1 |
| 30 | + end |
| 31 | + end |
| 32 | + |
| 33 | + if(score < maxscore) |
| 34 | + maxscore = score |
| 35 | + maxseq = i |
| 36 | + if(maxscore < 20) |
| 37 | + return [maxscore, seq[maxseq, align.size]] |
| 38 | + end |
| 39 | + end |
| 40 | + end |
| 41 | + if(maxscore > 30) |
| 42 | +# puts maxscore |
| 43 | +# puts align |
| 44 | +# puts seq[maxseq, align.size] |
| 45 | + end |
| 46 | + return [maxscore, seq[maxseq, align.size]] |
| 47 | +end |
| 48 | + |
| 49 | +#Exon sequences to compare against(used for scoring) |
| 50 | +a_exon2_align='GCTCCCACTCCATGAGGTATTTCTTCACATCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCGCCGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGAAGATGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGGAGACACGGAATATGAAGGCCCACTCACAGACTGACCGAGCGAACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGACG' |
| 51 | +a_exon3_align='GTTCTCACACCATCCAGATAATGTATGGCTGCGACGTGGGGCCGGACGGGCGCTTCCTCCGCGGGTACCGGCAGGACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCTTGGACCGCGGCGGACATGGCAGCTCAGATCACCAAGCGCAAGTGGGAGGCGGTCCATGCGGCGGAGCAGCGGAGAGTCTACCTGGAGGGCCGGTGCGTGGACGGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCACGG' |
| 52 | +b_exon2_align='GCTCCCACTCCATGAGGTATTTCTACACCTCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCTCAGTGGGCTACGTGGACGACACCCAGTTCGTGAGGTTCGACAGCGACGCCGCGAGTCCGAGAGAGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCGGAACACACAGATCTACAAGGCCCAGGCACAGACTGACCGAGAGAGCCTGCGGAACCTGCGCGGCTACTACAACCAGAGCGAGGCCG' |
| 53 | +b_exon3_align='GGTCTCACACCCTCCAGAGCATGTACGGCTGCGACGTGGGGCCGGACGGGCGCCTCCTCCGCGGGCATGACCAGTACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCCGCGGACACGGCGGCTCAGATCACCCAGCGCAAGTGGGAGGCGGCCCGTGAGGCGGAGCAGCGGAGAGCCTACCTGGAGGGCGAGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGACAAGCTGGAGCGCGCTG' |
| 54 | +c_exon2_align='GCTCCCACTCCATGAAGTATTTCTTCACATCCGTGTCCCGGCCTGGCCGCGGAGAGCCCCGCTTCATCTCAGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGTCCGAGAGGGGAGCCGCGGGCGCCGTGGGTGGAGCAGGAGGGGCCGGAGTATTGGGACCGGGAGACACAGAAGTACAAGCGCCAGGCACAGACTGACCGAGTGAGCCTGCGGAACCTGCGCGGCTACTACAACCAGAGCGAGGCCG' |
| 55 | +c_exon3_align='GGTCTCACACCCTCCAGTGGATGTGTGGCTGCGACCTGGGGCCCGACGGGCGCCTCCTCCGCGGGTATGACCAGTACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCCGCGGACACCGCGGCTCAGATCACCCAGCGCAAGTGGGAGGCGGCCCGTGAGGCGGAGCAGCGGAGAGCCTACCTGGAGGGCACGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCGCGG' |
| 56 | + |
| 57 | +Dir.chdir "config" |
| 58 | +filename = 'hla_nuc.fasta' |
| 59 | +timestamp_path = filename + '.mtime' |
| 60 | + |
| 61 | +repo_path = "https://raw.githubusercontent.com/ANHIG/IMGTHLA" |
| 62 | + |
| 63 | +# Find latest release at https://github.com/ANHIG/IMGTHLA/releases |
| 64 | +hla_nuc_version = File.read('hla_nuc.fasta.version.txt').strip |
| 65 | +puts "attempting to download version #{hla_nuc_version} from #{repo_path}" |
| 66 | + |
| 67 | +uri = URI.parse("#{repo_path}/#{hla_nuc_version}/hla_nuc.fasta") |
| 68 | +response = Net::HTTP.get_response(uri) |
| 69 | +md5 = Digest::MD5.new |
| 70 | +Net::HTTP.get_response(uri) do |response| |
| 71 | + response.value # Raise error if not 200 response code. |
| 72 | + File.open(filename, 'w') do |file| |
| 73 | + response.read_body do |segment| |
| 74 | + file.write(segment) |
| 75 | + md5 << segment |
| 76 | + end |
| 77 | + end |
| 78 | +end |
| 79 | +checksum_report = md5.hexdigest + ' ' + filename + "\n" |
| 80 | +File.write('hla_nuc.fasta.checksum.txt', checksum_report) |
| 81 | +puts "Parsing " + filename; |
| 82 | + |
| 83 | +hla_a = [] |
| 84 | +hla_b = [] |
| 85 | +hla_c = [] |
| 86 | + |
| 87 | +diff_reject = 32 |
| 88 | + |
| 89 | + |
| 90 | +#file = Bio::FastaFormat.open(filename) |
| 91 | +fasta = [] |
| 92 | +enu=[] |
| 93 | +File.open(filename) do |file| |
| 94 | + file.each_line do |line| |
| 95 | + if(line =~ /^>/) |
| 96 | + fasta.push(enu) |
| 97 | + enu = [line.strip, ''] |
| 98 | + else |
| 99 | + enu[1] += line.strip |
| 100 | + end |
| 101 | + end |
| 102 | + fasta.push(enu) |
| 103 | +end |
| 104 | + |
| 105 | +fasta.delete_if{|e| e== []} |
| 106 | + |
| 107 | +bar_width = 50 |
| 108 | +fasta.each_with_index do |entry, index| #for each fasta sequence |
| 109 | + #title = entry.definition[entry.definition.index(' ') + 1 .. entry.definition.size] |
| 110 | + title = entry[0].split(' ')[1] |
| 111 | + type = title[0, 1] |
| 112 | + data = entry[1] |
| 113 | + progress_bar = ('#' * (index.to_f/fasta.length*bar_width) + '.' * bar_width) |
| 114 | + progress_bar = progress_bar[0...bar_width] |
| 115 | + |
| 116 | + data.tr!("\n\t\r ", '') #get rid of whitespace |
| 117 | + message = '' |
| 118 | + |
| 119 | + if(type == 'A') |
| 120 | + exon2 = score(data, a_exon2_align) |
| 121 | + exon3 = score(data, a_exon3_align) |
| 122 | + if(exon2[0] <= diff_reject and exon3[0] <= diff_reject) |
| 123 | + message = "Approving " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s |
| 124 | + hla_a.push( [title, exon2[1], exon3[1]] ) |
| 125 | + else |
| 126 | + message = "***Rejecting " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s |
| 127 | + end |
| 128 | + elsif(type == 'B') |
| 129 | + exon2 = score(data, b_exon2_align) |
| 130 | + exon3 = score(data, b_exon3_align) |
| 131 | + if(exon2[0] <= diff_reject and exon3[0] <= diff_reject) |
| 132 | + message = "Approving " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s |
| 133 | + hla_b.push( [title, exon2[1], exon3[1]] ) |
| 134 | + else |
| 135 | + message = "***Rejecting " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s |
| 136 | + end |
| 137 | + elsif(type == 'C') |
| 138 | + exon2 = score(data, c_exon2_align) |
| 139 | + exon3 = score(data, c_exon3_align) |
| 140 | + if(exon2[0] <= diff_reject and exon3[0] <= diff_reject) |
| 141 | + message = "Approving " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s |
| 142 | + hla_c.push( [title, exon2[1], exon3[1]] ) |
| 143 | + else |
| 144 | + message = "***Rejecting " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s |
| 145 | + end |
| 146 | + end |
| 147 | + print "\r" + progress_bar + ' ' + message + ". " |
| 148 | +end |
| 149 | +puts "\r" + '#' * bar_width + ' Completed.' + ' ' * 18 |
| 150 | + |
| 151 | +#file.close |
| 152 | + |
| 153 | +#Lets sort, just to make things easier for our eyes |
| 154 | +hla_a.sort! |
| 155 | +hla_b.sort! |
| 156 | +hla_c.sort! |
| 157 | + |
| 158 | +File.open('hla_a_std.csv', 'w') do |file| |
| 159 | + hla_a.each do |entry| |
| 160 | + file.puts entry[0] + "," + entry[1] + "," + entry[2] |
| 161 | + end |
| 162 | +end |
| 163 | + |
| 164 | +File.open('hla_b_std.csv', 'w') do |file| |
| 165 | + hla_b.each do |entry| |
| 166 | + file.puts entry[0] + "," + entry[1] + "," + entry[2] |
| 167 | + end |
| 168 | +end |
| 169 | + |
| 170 | +File.open('hla_c_std.csv', 'w') do |file| |
| 171 | + hla_c.each do |entry| |
| 172 | + file.puts entry[0] + "," + entry[1] + "," + entry[2] |
| 173 | + end |
| 174 | +end |
| 175 | + |
| 176 | +File.delete(filename) |
0 commit comments