Skip to content

Commit de6af45

Browse files
authored
Merge pull request #3 from cfe-lab/FoldClinicalIn
Consolidate all functionality into one library
2 parents bd0a798 + 3c3b30f commit de6af45

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+858469
-3362
lines changed

.devcontainer/Dockerfile

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Base image for PyEasyHLA development.
2+
3+
# This image is the basis of the project's devcontainer as well.
4+
5+
ARG PYTHON_VERSION="3.13.3-bookworm"
6+
7+
FROM python:${PYTHON_VERSION} AS base
8+
9+
RUN apt update -y && apt upgrade -y
10+
11+
# Install the vendored software.
12+
ARG INSTANTCLIENT_BASIC="instantclient-basic-linux.x64-23.7.0.25.01.zip"
13+
# The value of ORACLE_HOME depends on the instant client used as it will
14+
# install to a path like ".../instantclient_23_7" where the version numbers
15+
# will vary.
16+
ARG ORACLE_HOME="/opt/oracle/instantclient_23_7"
17+
18+
ENV ORACLE_HOME=${ORACLE_HOME} \
19+
LD_LIBRARY_PATH=${ORACLE_HOME}:$LD_LIBRARY_PATH
20+
21+
COPY vendor/${INSTANTCLIENT_BASIC} /tmp/vendor/
22+
RUN unzip /tmp/vendor/${INSTANTCLIENT_BASIC} -d /opt/oracle &&\
23+
rm -rf /tmp/vendor

.devcontainer/devcontainer.json

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
22
// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu
33
{
4-
"image": "python:3.10-slim",
4+
"image": "python:3.13.3-bookworm",
55
"features": {
66
"ghcr.io/devcontainers/features/git:1": {
77
"ppa": true,
@@ -19,7 +19,9 @@
1919
"updateRemoteUserUID": true,
2020
"containerEnv": {
2121
"WORKON_HOME": "${containerWorkspaceFolder}/.venv",
22-
"PYTHONPATH": "${containerWorkspaceFolder}"
22+
"PYTHONPATH": "${containerWorkspaceFolder}",
23+
"ORACLE_HOME": "/opt/oracle/instantclient",
24+
"LD_LIBRARY_PATH": "/opt/oracle/instantclient"
2325
},
2426
"onCreateCommand": {
2527
"update apt": [
@@ -38,7 +40,20 @@
3840
"install",
3941
"--no-install-recommends",
4042
"-y",
41-
"openssh-client"
43+
"openssh-client",
44+
"libaio-dev",
45+
"unzip"
46+
],
47+
"install Instant Client": [
48+
"/usr/bin/env",
49+
"bash",
50+
"${containerWorkspaceFolder}/.devcontainer/install_instant_client.bash"
51+
],
52+
"install Ruby": [
53+
"/usr/bin/apt",
54+
"install",
55+
"-y",
56+
"ruby"
4257
]
4358
},
4459
"postAttachCommand": {
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#! /usr/bin/env bash
2+
3+
mkdir /tmp/vendor
4+
cp vendor/instantclient*.zip /tmp/vendor/
5+
unzip /tmp/vendor/instantclient*.zip -d /opt/oracle
6+
ln -s /opt/oracle/instantclient_* /opt/oracle/instantclient
7+
rm -rf /tmp/vendor
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
Dir.chdir "config"
2+
['a','b','c'].each do |letter|
3+
data = []
4+
source_filename = "hla_#{letter}_std.csv"
5+
puts "reading #{source_filename}..."
6+
File.open(source_filename) do |file|
7+
file.each_line do |line|
8+
data << line.strip.split(',')
9+
end
10+
end
11+
12+
data.sort! do |a,b|
13+
tmpa = a[0].split(':')
14+
tmpb = b[0].split(':')
15+
(tmpa[0] != tmpb[0] ? tmpa[0].split('*')[1].to_i <=> tmpb[0].split('*')[1].to_i :
16+
(tmpa[1] != tmpb[1] ? tmpa[1].to_i <=> tmpb[1].to_i :
17+
(tmpa[2] != tmpb[2] ? tmpa[2].to_i <=> tmpb[2].to_i :
18+
tmpa[3].to_i <=> tmpb[3].to_i )))
19+
end
20+
21+
#File.open("hla_std_#{letter}_test.csv", 'w') do |file|
22+
#data.each do |d|
23+
#file.puts d.join(',')
24+
#end
25+
#end
26+
27+
data.each_with_index do |orig, i|
28+
next if(orig == nil)
29+
orig_allele = orig[0]
30+
orig_seq = orig[1 .. 2]
31+
match_count = 0
32+
data.each_with_index do |row, j|
33+
next if(j <= i or row == nil)
34+
if(orig_seq == row[1 .. 2])
35+
match_count += 1
36+
puts "Reducing #{row[0]} into #{orig_allele}"
37+
data[j] = nil
38+
end
39+
end
40+
if(match_count > 0)
41+
gallele = orig_allele.split(':')
42+
gallele = gallele[0 .. 2] if(gallele.size > 3)
43+
gallele = gallele.join(':') + 'G'
44+
puts "Turning #{orig_allele} into #{gallele}"
45+
orig[0] = gallele
46+
end
47+
48+
end
49+
50+
data.delete(nil)
51+
52+
reduced_filename = "hla_#{letter}_std_reduced.csv"
53+
puts "writing #{reduced_filename}"
54+
File.open(reduced_filename, 'w') do |file|
55+
data.each do |d|
56+
file.puts d.join(',')
57+
end
58+
end
59+
60+
File.delete(source_filename)
61+
system('zip', "hla_#{letter}_std_reduced.zip", reduced_filename)
62+
end
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
#!/usr/bin/ruby
2+
#
3+
# This program downloads the newest version of hla_nuc.fasta and then
4+
# processes it, creating three standard files for a, b, and c. Any sequence
5+
# that doesn't seem to align properly will be rejected.
6+
# The alignment algorithm I'm using is the dirtiest possible.
7+
# This program requires the bioruby library
8+
9+
10+
#require 'bio'
11+
require 'fileutils'
12+
require 'net/http'
13+
require 'uri'
14+
require 'date'
15+
require 'digest'
16+
require 'json'
17+
18+
#scores the sequence according to how many characters don't match. An
19+
#alignment of 0 means a perfect match. We optimize it by assuming anything under 20 is probably
20+
#a match.
21+
def score(seq, align)
22+
maxscore = align.size
23+
maxseq = -1;
24+
25+
0.upto(seq.size - align.size) do |i|
26+
score = align.size
27+
0.upto(align.size - 1) do |j|
28+
if(align[j] == seq[i + j])
29+
score -= 1
30+
end
31+
end
32+
33+
if(score < maxscore)
34+
maxscore = score
35+
maxseq = i
36+
if(maxscore < 20)
37+
return [maxscore, seq[maxseq, align.size]]
38+
end
39+
end
40+
end
41+
if(maxscore > 30)
42+
# puts maxscore
43+
# puts align
44+
# puts seq[maxseq, align.size]
45+
end
46+
return [maxscore, seq[maxseq, align.size]]
47+
end
48+
49+
#Exon sequences to compare against(used for scoring)
50+
a_exon2_align='GCTCCCACTCCATGAGGTATTTCTTCACATCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCGCCGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGAAGATGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGGAGACACGGAATATGAAGGCCCACTCACAGACTGACCGAGCGAACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGACG'
51+
a_exon3_align='GTTCTCACACCATCCAGATAATGTATGGCTGCGACGTGGGGCCGGACGGGCGCTTCCTCCGCGGGTACCGGCAGGACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCTTGGACCGCGGCGGACATGGCAGCTCAGATCACCAAGCGCAAGTGGGAGGCGGTCCATGCGGCGGAGCAGCGGAGAGTCTACCTGGAGGGCCGGTGCGTGGACGGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCACGG'
52+
b_exon2_align='GCTCCCACTCCATGAGGTATTTCTACACCTCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCTCAGTGGGCTACGTGGACGACACCCAGTTCGTGAGGTTCGACAGCGACGCCGCGAGTCCGAGAGAGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCGGAACACACAGATCTACAAGGCCCAGGCACAGACTGACCGAGAGAGCCTGCGGAACCTGCGCGGCTACTACAACCAGAGCGAGGCCG'
53+
b_exon3_align='GGTCTCACACCCTCCAGAGCATGTACGGCTGCGACGTGGGGCCGGACGGGCGCCTCCTCCGCGGGCATGACCAGTACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCCGCGGACACGGCGGCTCAGATCACCCAGCGCAAGTGGGAGGCGGCCCGTGAGGCGGAGCAGCGGAGAGCCTACCTGGAGGGCGAGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGACAAGCTGGAGCGCGCTG'
54+
c_exon2_align='GCTCCCACTCCATGAAGTATTTCTTCACATCCGTGTCCCGGCCTGGCCGCGGAGAGCCCCGCTTCATCTCAGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGTCCGAGAGGGGAGCCGCGGGCGCCGTGGGTGGAGCAGGAGGGGCCGGAGTATTGGGACCGGGAGACACAGAAGTACAAGCGCCAGGCACAGACTGACCGAGTGAGCCTGCGGAACCTGCGCGGCTACTACAACCAGAGCGAGGCCG'
55+
c_exon3_align='GGTCTCACACCCTCCAGTGGATGTGTGGCTGCGACCTGGGGCCCGACGGGCGCCTCCTCCGCGGGTATGACCAGTACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCCGCGGACACCGCGGCTCAGATCACCCAGCGCAAGTGGGAGGCGGCCCGTGAGGCGGAGCAGCGGAGAGCCTACCTGGAGGGCACGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCGCGG'
56+
57+
Dir.chdir "config"
58+
filename = 'hla_nuc.fasta'
59+
timestamp_path = filename + '.mtime'
60+
61+
repo_path = "https://raw.githubusercontent.com/ANHIG/IMGTHLA"
62+
63+
# Find latest release at https://github.com/ANHIG/IMGTHLA/releases
64+
hla_nuc_version = File.read('hla_nuc.fasta.version.txt').strip
65+
puts "attempting to download version #{hla_nuc_version} from #{repo_path}"
66+
67+
uri = URI.parse("#{repo_path}/#{hla_nuc_version}/hla_nuc.fasta")
68+
response = Net::HTTP.get_response(uri)
69+
md5 = Digest::MD5.new
70+
Net::HTTP.get_response(uri) do |response|
71+
response.value # Raise error if not 200 response code.
72+
File.open(filename, 'w') do |file|
73+
response.read_body do |segment|
74+
file.write(segment)
75+
md5 << segment
76+
end
77+
end
78+
end
79+
checksum_report = md5.hexdigest + ' ' + filename + "\n"
80+
File.write('hla_nuc.fasta.checksum.txt', checksum_report)
81+
puts "Parsing " + filename;
82+
83+
hla_a = []
84+
hla_b = []
85+
hla_c = []
86+
87+
diff_reject = 32
88+
89+
90+
#file = Bio::FastaFormat.open(filename)
91+
fasta = []
92+
enu=[]
93+
File.open(filename) do |file|
94+
file.each_line do |line|
95+
if(line =~ /^>/)
96+
fasta.push(enu)
97+
enu = [line.strip, '']
98+
else
99+
enu[1] += line.strip
100+
end
101+
end
102+
fasta.push(enu)
103+
end
104+
105+
fasta.delete_if{|e| e== []}
106+
107+
bar_width = 50
108+
fasta.each_with_index do |entry, index| #for each fasta sequence
109+
#title = entry.definition[entry.definition.index(' ') + 1 .. entry.definition.size]
110+
title = entry[0].split(' ')[1]
111+
type = title[0, 1]
112+
data = entry[1]
113+
progress_bar = ('#' * (index.to_f/fasta.length*bar_width) + '.' * bar_width)
114+
progress_bar = progress_bar[0...bar_width]
115+
116+
data.tr!("\n\t\r ", '') #get rid of whitespace
117+
message = ''
118+
119+
if(type == 'A')
120+
exon2 = score(data, a_exon2_align)
121+
exon3 = score(data, a_exon3_align)
122+
if(exon2[0] <= diff_reject and exon3[0] <= diff_reject)
123+
message = "Approving " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s
124+
hla_a.push( [title, exon2[1], exon3[1]] )
125+
else
126+
message = "***Rejecting " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s
127+
end
128+
elsif(type == 'B')
129+
exon2 = score(data, b_exon2_align)
130+
exon3 = score(data, b_exon3_align)
131+
if(exon2[0] <= diff_reject and exon3[0] <= diff_reject)
132+
message = "Approving " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s
133+
hla_b.push( [title, exon2[1], exon3[1]] )
134+
else
135+
message = "***Rejecting " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s
136+
end
137+
elsif(type == 'C')
138+
exon2 = score(data, c_exon2_align)
139+
exon3 = score(data, c_exon3_align)
140+
if(exon2[0] <= diff_reject and exon3[0] <= diff_reject)
141+
message = "Approving " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s
142+
hla_c.push( [title, exon2[1], exon3[1]] )
143+
else
144+
message = "***Rejecting " + title + ": " + exon2[0].to_s + " " + exon3[0].to_s
145+
end
146+
end
147+
print "\r" + progress_bar + ' ' + message + ". "
148+
end
149+
puts "\r" + '#' * bar_width + ' Completed.' + ' ' * 18
150+
151+
#file.close
152+
153+
#Lets sort, just to make things easier for our eyes
154+
hla_a.sort!
155+
hla_b.sort!
156+
hla_c.sort!
157+
158+
File.open('hla_a_std.csv', 'w') do |file|
159+
hla_a.each do |entry|
160+
file.puts entry[0] + "," + entry[1] + "," + entry[2]
161+
end
162+
end
163+
164+
File.open('hla_b_std.csv', 'w') do |file|
165+
hla_b.each do |entry|
166+
file.puts entry[0] + "," + entry[1] + "," + entry[2]
167+
end
168+
end
169+
170+
File.open('hla_c_std.csv', 'w') do |file|
171+
hla_c.each do |entry|
172+
file.puts entry[0] + "," + entry[1] + "," + entry[2]
173+
end
174+
end
175+
176+
File.delete(filename)

0 commit comments

Comments
 (0)