1 | #! /usr/bin/env ruby |
---|
2 | |
---|
3 | $MAFFTCOMMAND = '"/usr/local/bin/mafft"' |
---|
4 | # Edit the above line to specify the location of mafft. |
---|
5 | # $MAFFTCOMMAND = '"C:\folder name\mafft.bat"' # windows |
---|
6 | # $MAFFTCOMMAND = '"/usr/local/bin/mafft"' # mac or cygwin |
---|
7 | # $MAFFTCOMMAND = '"/usr/bin/mafft"' # linux (rpm) |
---|
8 | # $MAFFTCOMMAND = '"/somewhere/mafft.bat"' # all-in-one version for linux or mac |
---|
9 | |
---|
10 | ##################################################################### |
---|
11 | # |
---|
12 | # regionalrealignment.rb version 0.1 (2013/Jul/12) |
---|
13 | # ruby regionalrealignment.rb setting input > output |
---|
14 | # See http://mafft.cbrc.jp/alignment/software/regionalrealignment.html |
---|
15 | # |
---|
16 | ##################################################################### |
---|
17 | |
---|
18 | |
---|
19 | def readfasta( fp, name, seq ) |
---|
20 | nseq = 0 |
---|
21 | tmpseq = "" |
---|
22 | while fp.gets |
---|
23 | if $_ =~ /^>/ then |
---|
24 | name.push( $_.sub(/>/,"").strip ) |
---|
25 | seq.push( tmpseq ) if nseq > 0 |
---|
26 | nseq += 1 |
---|
27 | tmpseq = "" |
---|
28 | else |
---|
29 | tmpseq += $_.strip |
---|
30 | end |
---|
31 | end |
---|
32 | seq.push( tmpseq ) |
---|
33 | return nseq |
---|
34 | end |
---|
35 | |
---|
36 | def resolve( tree ) |
---|
37 | while 1 |
---|
38 | # p tree |
---|
39 | tree.sub!( /\,([0-9]+):(\-?[0-9\.]+)\,([0-9]+):(\-?[0-9\.]+)/, ",XXX" ) |
---|
40 | hit1 = $1 |
---|
41 | hit2 = $2 |
---|
42 | hit3 = $3 |
---|
43 | hit4 = $4 |
---|
44 | |
---|
45 | # p hit1 |
---|
46 | # p hit2 |
---|
47 | # p hit3 |
---|
48 | # p hit4 |
---|
49 | |
---|
50 | # puts "introduce XXX" |
---|
51 | # p tree |
---|
52 | |
---|
53 | break unless tree.index(/XXX/) |
---|
54 | |
---|
55 | poshit = tree.index(/XXX/) |
---|
56 | # puts "poshit=" + poshit.to_s |
---|
57 | |
---|
58 | i = poshit |
---|
59 | height = 0 |
---|
60 | while i >= 0 |
---|
61 | break if height == 0 && tree[i..i] == '(' |
---|
62 | if tree[i..i] == ')' then |
---|
63 | height += 1 |
---|
64 | elsif tree[i..i] == '(' then |
---|
65 | height -= 1 |
---|
66 | end |
---|
67 | i -= 1 |
---|
68 | end |
---|
69 | |
---|
70 | poskakko = i |
---|
71 | # puts "poskakko = " + poskakko.to_s |
---|
72 | zenhan = tree[0..poskakko] |
---|
73 | zenhan = "" if poskakko == -1 |
---|
74 | # puts "zenhan = " + zenhan |
---|
75 | |
---|
76 | treelen = tree.length |
---|
77 | tree = zenhan + "(" + tree[poskakko+1..treelen] |
---|
78 | # puts "add (" |
---|
79 | # p tree |
---|
80 | tree.sub!( /XXX/, "#{hit1}:#{hit2}):0,#{hit3}:#{hit4}" ) |
---|
81 | |
---|
82 | # p tree |
---|
83 | end |
---|
84 | |
---|
85 | |
---|
86 | return tree |
---|
87 | |
---|
88 | end |
---|
89 | |
---|
90 | if ARGV.length != 2 then |
---|
91 | STDERR.puts "" |
---|
92 | STDERR.puts "Usage: ruby #{$0} setingfile inputfile > output" |
---|
93 | STDERR.puts "" |
---|
94 | exit 1 |
---|
95 | end |
---|
96 | |
---|
97 | infilename = ARGV[1] |
---|
98 | tname = [] |
---|
99 | tseq = [] |
---|
100 | infp = File.open( infilename, "r" ) |
---|
101 | tin = readfasta( infp, tname, tseq ) |
---|
102 | infp.close |
---|
103 | |
---|
104 | if tin == 0 then |
---|
105 | STDERR.puts "" |
---|
106 | STDERR.puts "Error in the '#{infilename}' file. Is this FASTA format?\n" |
---|
107 | STDERR.puts "" |
---|
108 | exit 1 |
---|
109 | end |
---|
110 | |
---|
111 | alnlen = tseq[0].length |
---|
112 | if alnlen == 0 then |
---|
113 | STDERR.puts "" |
---|
114 | STDERR.puts "Error in the '#{infilename}' file. Is this FASTA format?\n" |
---|
115 | STDERR.puts "" |
---|
116 | exit 1 |
---|
117 | end |
---|
118 | |
---|
119 | |
---|
120 | for i in 0..(tin-1) |
---|
121 | if alnlen != tseq[i].length then |
---|
122 | STDERR.puts "" |
---|
123 | STDERR.puts "Please insert gaps such that all the input sequences have the same length.\n" |
---|
124 | STDERR.puts "" |
---|
125 | exit 1 |
---|
126 | end |
---|
127 | end |
---|
128 | |
---|
129 | checkmap = [] |
---|
130 | for i in 0..(alnlen-1) |
---|
131 | checkmap.push(0) |
---|
132 | end |
---|
133 | |
---|
134 | outputseq = [] |
---|
135 | for i in 0..(tin-1) |
---|
136 | outputseq.push("") |
---|
137 | end |
---|
138 | |
---|
139 | |
---|
140 | settingfile = ARGV[0].to_s |
---|
141 | reg = [] |
---|
142 | startpos = [] |
---|
143 | endpos = [] |
---|
144 | realign = [] |
---|
145 | options = [] |
---|
146 | treeoption = "" |
---|
147 | revwarn = 0 |
---|
148 | sfp = File.open( settingfile, "r" ) |
---|
149 | while line = sfp.gets |
---|
150 | line.sub!(/#.*/,"") |
---|
151 | next if line.length < 2 |
---|
152 | if line.strip =~ /^treeoption / then |
---|
153 | treeoption = line.strip.sub(/.*treeoption/,"") |
---|
154 | break |
---|
155 | end |
---|
156 | end |
---|
157 | sfp.close |
---|
158 | sfp = File.open( settingfile, "r" ) |
---|
159 | while line = sfp.gets |
---|
160 | line.sub!(/#.*/,"") |
---|
161 | next if line.length < 2 |
---|
162 | next if line.strip =~ /^treeoption/ |
---|
163 | startposv = line.split(' ')[0].to_i - 1 |
---|
164 | endposv = line.split(' ')[1].to_i - 1 |
---|
165 | if startposv < 0 || endposv < 0 then |
---|
166 | STDERR.puts "\nError in the '#{settingfile}' file. Please check this line:\n" |
---|
167 | STDERR.puts line |
---|
168 | STDERR.puts "Sites must be numbered as 1, 2, ...\n" |
---|
169 | STDERR.puts "\n" |
---|
170 | exit 1 |
---|
171 | end |
---|
172 | if startposv >= alnlen || endposv >= alnlen then |
---|
173 | STDERR.puts "\nError in the '#{settingfile}' file. Please check this line:\n" |
---|
174 | STDERR.puts line |
---|
175 | STDERR.puts "Sites must be numbered as 1, 2, ... #{alnlen}\n" |
---|
176 | STDERR.puts "\n" |
---|
177 | exit 1 |
---|
178 | end |
---|
179 | if startposv > endposv then |
---|
180 | STDERR.puts "\nWarning. Please check this line:\n" |
---|
181 | STDERR.puts line |
---|
182 | STDERR.puts "Start position > End position ?\n" |
---|
183 | STDERR.puts "\n" |
---|
184 | revwarn = 1 |
---|
185 | # exit 1 |
---|
186 | end |
---|
187 | startpos.push( startposv ) |
---|
188 | endpos.push( endposv ) |
---|
189 | if startposv > endposv |
---|
190 | for k in (endposv)..(startposv) |
---|
191 | checkmap[k] += 1 |
---|
192 | end |
---|
193 | else |
---|
194 | for k in (startposv)..(endposv) |
---|
195 | checkmap[k] += 1 |
---|
196 | end |
---|
197 | end |
---|
198 | if line.split(' ')[2] == "realign" then |
---|
199 | realign.push( 1 ) |
---|
200 | elsif line.split(' ')[2] == "preserve" then |
---|
201 | realign.push( 0 ) |
---|
202 | else |
---|
203 | STDERR.puts "\n" |
---|
204 | STDERR.puts "The third column must be 'realign' or 'preserve'\n" |
---|
205 | STDERR.puts "Please check this line:\n" |
---|
206 | STDERR.puts line |
---|
207 | STDERR.puts "\n" |
---|
208 | exit 1 |
---|
209 | end |
---|
210 | if line =~ / \-\-/ && line =~ /realign/ then |
---|
211 | options.push( line.sub(/.*realign/,"").strip ) |
---|
212 | else |
---|
213 | options.push( treeoption ) |
---|
214 | end |
---|
215 | end |
---|
216 | sfp.close |
---|
217 | |
---|
218 | #p startpos |
---|
219 | #p endpos |
---|
220 | #p options |
---|
221 | |
---|
222 | |
---|
223 | #res = system "#{$MAFFTCOMMAND} #{treeoption} --treeout --retree 0 --thread -1 #{infilename} > _dum" |
---|
224 | res = system "#{$MAFFTCOMMAND} #{treeoption} --treeout --retree 0 #{infilename} > _dum" |
---|
225 | |
---|
226 | if res == false then |
---|
227 | STDERR.puts "\n" |
---|
228 | STDERR.puts "ERROR in building a guide tree" |
---|
229 | STDERR.puts "\n" |
---|
230 | exit 1 |
---|
231 | end |
---|
232 | |
---|
233 | treefp = File.open( "#{infilename}.tree", "r" ) |
---|
234 | |
---|
235 | tree = "" |
---|
236 | while line = treefp.gets |
---|
237 | tree += line.strip |
---|
238 | break if tree =~ /;$/ |
---|
239 | end |
---|
240 | treefp.close |
---|
241 | |
---|
242 | tree = tree.gsub( /_.*?:/, ":" ).gsub(/[0-9]\.[0-9]*e-[0-9][0-9]/, "0").gsub(/\[.*?\]/,"").gsub(/ /, "") |
---|
243 | scale = 1.0 |
---|
244 | mtreefp = File.open("_tree", "w") |
---|
245 | |
---|
246 | |
---|
247 | STDERR.puts "Tree = " + tree |
---|
248 | |
---|
249 | memi = [-1,-1] |
---|
250 | leni = [-1,-1] |
---|
251 | |
---|
252 | while tree.index( /\(/ ) |
---|
253 | |
---|
254 | tree = resolve( tree ) |
---|
255 | |
---|
256 | tree.sub!( /\(([0-9]+):(\-?[0-9\.]+),([0-9]+):(\-?[0-9\.]+)\)/, "XXX" ) |
---|
257 | memi[0] = $1.to_i |
---|
258 | leni[0] = $2.to_f * scale |
---|
259 | memi[1] = $3.to_i |
---|
260 | leni[1] = $4.to_f * scale |
---|
261 | |
---|
262 | if leni[0] > 10 || leni[1] > 10 then |
---|
263 | STDERR.puts "" |
---|
264 | STDERR.puts "Please check the scale of branch length!" |
---|
265 | STDERR.puts "The unit of branch lengths must be 'substitution/site'" |
---|
266 | STDERR.puts "If the unit is 'substition' in your tree, please" |
---|
267 | STDERR.puts "use the scale argument," |
---|
268 | STDERR.puts "% newick2mafft scale in > out" |
---|
269 | STDERR.puts "where scale = 1/(alignment length)" |
---|
270 | STDERR.puts "" |
---|
271 | exit 1 |
---|
272 | end |
---|
273 | |
---|
274 | # STDERR.puts "subtree = " + $& |
---|
275 | |
---|
276 | if memi[1] < memi[0] then |
---|
277 | memi.reverse! |
---|
278 | leni.reverse! |
---|
279 | end |
---|
280 | |
---|
281 | tree.sub!( /XXX/, memi[0].to_s ) |
---|
282 | |
---|
283 | # STDERR.puts "Tree = " + tree |
---|
284 | |
---|
285 | mtreefp.printf( "%5d %5d %10.5f %10.5f\n", memi[0], memi[1], leni[0], leni[1] ) |
---|
286 | |
---|
287 | end |
---|
288 | |
---|
289 | |
---|
290 | mtreefp.close |
---|
291 | |
---|
292 | numreg = startpos.length |
---|
293 | |
---|
294 | for i in 0..(numreg-1) |
---|
295 | |
---|
296 | partfp = File.open( "_part", "w" ) |
---|
297 | for j in 0..(tin-1) |
---|
298 | partfp.puts ">" + tname[j] |
---|
299 | if startpos[i] > endpos[i] then |
---|
300 | partfp.puts tseq[j][endpos[i]..startpos[i]].reverse |
---|
301 | else |
---|
302 | partfp.puts tseq[j][startpos[i]..endpos[i]] |
---|
303 | end |
---|
304 | end |
---|
305 | partfp.close |
---|
306 | |
---|
307 | if( realign[i] == 1 ) then |
---|
308 | STDERR.puts "Aligning region #{startpos[i]+1} - #{endpos[i]+1}" |
---|
309 | res = system "#{$MAFFTCOMMAND} #{options[i]} --inputorder --treein _tree _part > _partout" |
---|
310 | if res == false then |
---|
311 | STDERR.puts "\n" |
---|
312 | STDERR.puts "ERROR in aligning region #{startpos[i]+1} - #{endpos[i]+1}" |
---|
313 | STDERR.puts "Please check the option:" |
---|
314 | STDERR.puts "#{options[i]}" |
---|
315 | STDERR.puts "\n" |
---|
316 | exit 1 |
---|
317 | end |
---|
318 | |
---|
319 | else |
---|
320 | STDERR.puts "Copying region #{startpos[i]+1} - #{endpos[i]+1}" |
---|
321 | system "cp _part _partout" |
---|
322 | end |
---|
323 | |
---|
324 | pname = [] |
---|
325 | pseq = [] |
---|
326 | partfp = File.open( "_partout", "r" ) |
---|
327 | pin = readfasta( partfp, pname, pseq ) |
---|
328 | partfp.close |
---|
329 | for j in 0..(tin-1) |
---|
330 | outputseq[j] += pseq[j] |
---|
331 | end |
---|
332 | end |
---|
333 | |
---|
334 | for j in 0..(tin-1) |
---|
335 | puts ">" + tname[j] |
---|
336 | puts outputseq[j] |
---|
337 | end |
---|
338 | |
---|
339 | STDERR.puts "Done." |
---|
340 | |
---|
341 | numdupsites = checkmap.select{|x| x>1}.length |
---|
342 | if numdupsites > 0 then |
---|
343 | STDERR.puts "" |
---|
344 | STDERR.puts "#########################################################" |
---|
345 | STDERR.puts "# Warning: #{numdupsites} sites were duplicatedly selected." |
---|
346 | STDERR.puts "#########################################################" |
---|
347 | STDERR.puts "" |
---|
348 | end |
---|
349 | |
---|
350 | numunselectedsites = checkmap.select{|x| x==0}.length |
---|
351 | if numunselectedsites > 0 then |
---|
352 | STDERR.puts "" |
---|
353 | STDERR.puts "#########################################################" |
---|
354 | STDERR.puts "# Warning: #{numunselectedsites} sites were not selected." |
---|
355 | STDERR.puts "#########################################################" |
---|
356 | STDERR.puts "" |
---|
357 | end |
---|
358 | |
---|
359 | if revwarn == 1 then |
---|
360 | STDERR.puts "" |
---|
361 | STDERR.puts "#########################################################" |
---|
362 | STDERR.puts "# Warning: The order of sites were reversed." |
---|
363 | STDERR.puts "#########################################################" |
---|
364 | STDERR.puts "" |
---|
365 | end |
---|
366 | |
---|
367 | |
---|
368 | STDERR.puts "" |
---|
369 | STDERR.puts " Tree: computed with #{treeoption} --treeout " |
---|
370 | for i in 0..(numreg-1) |
---|
371 | range = sprintf( "%6d - %6d", startpos[i]+1, endpos[i]+1 ) |
---|
372 | if realign[i] == 1 then |
---|
373 | STDERR.puts "#{range}: realigned with #{options[i]} --treein (tree)" |
---|
374 | else |
---|
375 | STDERR.puts "#{range}: preserved" |
---|
376 | end |
---|
377 | end |
---|
378 | STDERR.puts "" |
---|
379 | |
---|
380 | File.delete( "_dum" ) |
---|
381 | File.delete( "_tree" ) |
---|
382 | File.delete( "_part" ) |
---|
383 | File.delete( "_partout" ) |
---|
384 | |
---|