| 1 | #! /usr/bin/env ruby |
|---|
| 2 | |
|---|
| 3 | $MAFFTCOMMAND = '"/usr/local/bin/mafft"' |
|---|
| 4 | # Edit the above line to specify the location of mafft. |
|---|
| 5 | # $MAFFTCOMMAND = '"C:\folder name\mafft.bat"' # windows |
|---|
| 6 | # $MAFFTCOMMAND = '"/usr/local/bin/mafft"' # mac or cygwin |
|---|
| 7 | # $MAFFTCOMMAND = '"/usr/bin/mafft"' # linux (rpm) |
|---|
| 8 | # $MAFFTCOMMAND = '"/somewhere/mafft.bat"' # all-in-one version for linux or mac |
|---|
| 9 | |
|---|
| 10 | ##################################################################### |
|---|
| 11 | # |
|---|
| 12 | # regionalrealignment.rb version 0.1 (2013/Jul/12) |
|---|
| 13 | # ruby regionalrealignment.rb setting input > output |
|---|
| 14 | # See http://mafft.cbrc.jp/alignment/software/regionalrealignment.html |
|---|
| 15 | # |
|---|
| 16 | ##################################################################### |
|---|
| 17 | |
|---|
| 18 | |
|---|
| 19 | def readfasta( fp, name, seq ) |
|---|
| 20 | nseq = 0 |
|---|
| 21 | tmpseq = "" |
|---|
| 22 | while fp.gets |
|---|
| 23 | if $_ =~ /^>/ then |
|---|
| 24 | name.push( $_.sub(/>/,"").strip ) |
|---|
| 25 | seq.push( tmpseq ) if nseq > 0 |
|---|
| 26 | nseq += 1 |
|---|
| 27 | tmpseq = "" |
|---|
| 28 | else |
|---|
| 29 | tmpseq += $_.strip |
|---|
| 30 | end |
|---|
| 31 | end |
|---|
| 32 | seq.push( tmpseq ) |
|---|
| 33 | return nseq |
|---|
| 34 | end |
|---|
| 35 | |
|---|
| 36 | def resolve( tree ) |
|---|
| 37 | while 1 |
|---|
| 38 | # p tree |
|---|
| 39 | tree.sub!( /\,([0-9]+):(\-?[0-9\.]+)\,([0-9]+):(\-?[0-9\.]+)/, ",XXX" ) |
|---|
| 40 | hit1 = $1 |
|---|
| 41 | hit2 = $2 |
|---|
| 42 | hit3 = $3 |
|---|
| 43 | hit4 = $4 |
|---|
| 44 | |
|---|
| 45 | # p hit1 |
|---|
| 46 | # p hit2 |
|---|
| 47 | # p hit3 |
|---|
| 48 | # p hit4 |
|---|
| 49 | |
|---|
| 50 | # puts "introduce XXX" |
|---|
| 51 | # p tree |
|---|
| 52 | |
|---|
| 53 | break unless tree.index(/XXX/) |
|---|
| 54 | |
|---|
| 55 | poshit = tree.index(/XXX/) |
|---|
| 56 | # puts "poshit=" + poshit.to_s |
|---|
| 57 | |
|---|
| 58 | i = poshit |
|---|
| 59 | height = 0 |
|---|
| 60 | while i >= 0 |
|---|
| 61 | break if height == 0 && tree[i..i] == '(' |
|---|
| 62 | if tree[i..i] == ')' then |
|---|
| 63 | height += 1 |
|---|
| 64 | elsif tree[i..i] == '(' then |
|---|
| 65 | height -= 1 |
|---|
| 66 | end |
|---|
| 67 | i -= 1 |
|---|
| 68 | end |
|---|
| 69 | |
|---|
| 70 | poskakko = i |
|---|
| 71 | # puts "poskakko = " + poskakko.to_s |
|---|
| 72 | zenhan = tree[0..poskakko] |
|---|
| 73 | zenhan = "" if poskakko == -1 |
|---|
| 74 | # puts "zenhan = " + zenhan |
|---|
| 75 | |
|---|
| 76 | treelen = tree.length |
|---|
| 77 | tree = zenhan + "(" + tree[poskakko+1..treelen] |
|---|
| 78 | # puts "add (" |
|---|
| 79 | # p tree |
|---|
| 80 | tree.sub!( /XXX/, "#{hit1}:#{hit2}):0,#{hit3}:#{hit4}" ) |
|---|
| 81 | |
|---|
| 82 | # p tree |
|---|
| 83 | end |
|---|
| 84 | |
|---|
| 85 | |
|---|
| 86 | return tree |
|---|
| 87 | |
|---|
| 88 | end |
|---|
| 89 | |
|---|
| 90 | if ARGV.length != 2 then |
|---|
| 91 | STDERR.puts "" |
|---|
| 92 | STDERR.puts "Usage: ruby #{$0} setingfile inputfile > output" |
|---|
| 93 | STDERR.puts "" |
|---|
| 94 | exit 1 |
|---|
| 95 | end |
|---|
| 96 | |
|---|
| 97 | infilename = ARGV[1] |
|---|
| 98 | tname = [] |
|---|
| 99 | tseq = [] |
|---|
| 100 | infp = File.open( infilename, "r" ) |
|---|
| 101 | tin = readfasta( infp, tname, tseq ) |
|---|
| 102 | infp.close |
|---|
| 103 | |
|---|
| 104 | if tin == 0 then |
|---|
| 105 | STDERR.puts "" |
|---|
| 106 | STDERR.puts "Error in the '#{infilename}' file. Is this FASTA format?\n" |
|---|
| 107 | STDERR.puts "" |
|---|
| 108 | exit 1 |
|---|
| 109 | end |
|---|
| 110 | |
|---|
| 111 | alnlen = tseq[0].length |
|---|
| 112 | if alnlen == 0 then |
|---|
| 113 | STDERR.puts "" |
|---|
| 114 | STDERR.puts "Error in the '#{infilename}' file. Is this FASTA format?\n" |
|---|
| 115 | STDERR.puts "" |
|---|
| 116 | exit 1 |
|---|
| 117 | end |
|---|
| 118 | |
|---|
| 119 | |
|---|
| 120 | for i in 0..(tin-1) |
|---|
| 121 | if alnlen != tseq[i].length then |
|---|
| 122 | STDERR.puts "" |
|---|
| 123 | STDERR.puts "Please insert gaps such that all the input sequences have the same length.\n" |
|---|
| 124 | STDERR.puts "" |
|---|
| 125 | exit 1 |
|---|
| 126 | end |
|---|
| 127 | end |
|---|
| 128 | |
|---|
| 129 | checkmap = [] |
|---|
| 130 | for i in 0..(alnlen-1) |
|---|
| 131 | checkmap.push(0) |
|---|
| 132 | end |
|---|
| 133 | |
|---|
| 134 | outputseq = [] |
|---|
| 135 | for i in 0..(tin-1) |
|---|
| 136 | outputseq.push("") |
|---|
| 137 | end |
|---|
| 138 | |
|---|
| 139 | |
|---|
| 140 | settingfile = ARGV[0].to_s |
|---|
| 141 | reg = [] |
|---|
| 142 | startpos = [] |
|---|
| 143 | endpos = [] |
|---|
| 144 | realign = [] |
|---|
| 145 | options = [] |
|---|
| 146 | treeoption = "" |
|---|
| 147 | revwarn = 0 |
|---|
| 148 | sfp = File.open( settingfile, "r" ) |
|---|
| 149 | while line = sfp.gets |
|---|
| 150 | line.sub!(/#.*/,"") |
|---|
| 151 | next if line.length < 2 |
|---|
| 152 | if line.strip =~ /^treeoption / then |
|---|
| 153 | treeoption = line.strip.sub(/.*treeoption/,"") |
|---|
| 154 | break |
|---|
| 155 | end |
|---|
| 156 | end |
|---|
| 157 | sfp.close |
|---|
| 158 | sfp = File.open( settingfile, "r" ) |
|---|
| 159 | while line = sfp.gets |
|---|
| 160 | line.sub!(/#.*/,"") |
|---|
| 161 | next if line.length < 2 |
|---|
| 162 | next if line.strip =~ /^treeoption/ |
|---|
| 163 | startposv = line.split(' ')[0].to_i - 1 |
|---|
| 164 | endposv = line.split(' ')[1].to_i - 1 |
|---|
| 165 | if startposv < 0 || endposv < 0 then |
|---|
| 166 | STDERR.puts "\nError in the '#{settingfile}' file. Please check this line:\n" |
|---|
| 167 | STDERR.puts line |
|---|
| 168 | STDERR.puts "Sites must be numbered as 1, 2, ...\n" |
|---|
| 169 | STDERR.puts "\n" |
|---|
| 170 | exit 1 |
|---|
| 171 | end |
|---|
| 172 | if startposv >= alnlen || endposv >= alnlen then |
|---|
| 173 | STDERR.puts "\nError in the '#{settingfile}' file. Please check this line:\n" |
|---|
| 174 | STDERR.puts line |
|---|
| 175 | STDERR.puts "Sites must be numbered as 1, 2, ... #{alnlen}\n" |
|---|
| 176 | STDERR.puts "\n" |
|---|
| 177 | exit 1 |
|---|
| 178 | end |
|---|
| 179 | if startposv > endposv then |
|---|
| 180 | STDERR.puts "\nWarning. Please check this line:\n" |
|---|
| 181 | STDERR.puts line |
|---|
| 182 | STDERR.puts "Start position > End position ?\n" |
|---|
| 183 | STDERR.puts "\n" |
|---|
| 184 | revwarn = 1 |
|---|
| 185 | # exit 1 |
|---|
| 186 | end |
|---|
| 187 | startpos.push( startposv ) |
|---|
| 188 | endpos.push( endposv ) |
|---|
| 189 | if startposv > endposv |
|---|
| 190 | for k in (endposv)..(startposv) |
|---|
| 191 | checkmap[k] += 1 |
|---|
| 192 | end |
|---|
| 193 | else |
|---|
| 194 | for k in (startposv)..(endposv) |
|---|
| 195 | checkmap[k] += 1 |
|---|
| 196 | end |
|---|
| 197 | end |
|---|
| 198 | if line.split(' ')[2] == "realign" then |
|---|
| 199 | realign.push( 1 ) |
|---|
| 200 | elsif line.split(' ')[2] == "preserve" then |
|---|
| 201 | realign.push( 0 ) |
|---|
| 202 | else |
|---|
| 203 | STDERR.puts "\n" |
|---|
| 204 | STDERR.puts "The third column must be 'realign' or 'preserve'\n" |
|---|
| 205 | STDERR.puts "Please check this line:\n" |
|---|
| 206 | STDERR.puts line |
|---|
| 207 | STDERR.puts "\n" |
|---|
| 208 | exit 1 |
|---|
| 209 | end |
|---|
| 210 | if line =~ / \-\-/ && line =~ /realign/ then |
|---|
| 211 | options.push( line.sub(/.*realign/,"").strip ) |
|---|
| 212 | else |
|---|
| 213 | options.push( treeoption ) |
|---|
| 214 | end |
|---|
| 215 | end |
|---|
| 216 | sfp.close |
|---|
| 217 | |
|---|
| 218 | #p startpos |
|---|
| 219 | #p endpos |
|---|
| 220 | #p options |
|---|
| 221 | |
|---|
| 222 | |
|---|
| 223 | #res = system "#{$MAFFTCOMMAND} #{treeoption} --treeout --retree 0 --thread -1 #{infilename} > _dum" |
|---|
| 224 | res = system "#{$MAFFTCOMMAND} #{treeoption} --treeout --retree 0 #{infilename} > _dum" |
|---|
| 225 | |
|---|
| 226 | if res == false then |
|---|
| 227 | STDERR.puts "\n" |
|---|
| 228 | STDERR.puts "ERROR in building a guide tree" |
|---|
| 229 | STDERR.puts "\n" |
|---|
| 230 | exit 1 |
|---|
| 231 | end |
|---|
| 232 | |
|---|
| 233 | treefp = File.open( "#{infilename}.tree", "r" ) |
|---|
| 234 | |
|---|
| 235 | tree = "" |
|---|
| 236 | while line = treefp.gets |
|---|
| 237 | tree += line.strip |
|---|
| 238 | break if tree =~ /;$/ |
|---|
| 239 | end |
|---|
| 240 | treefp.close |
|---|
| 241 | |
|---|
| 242 | tree = tree.gsub( /_.*?:/, ":" ).gsub(/[0-9]\.[0-9]*e-[0-9][0-9]/, "0").gsub(/\[.*?\]/,"").gsub(/ /, "") |
|---|
| 243 | scale = 1.0 |
|---|
| 244 | mtreefp = File.open("_tree", "w") |
|---|
| 245 | |
|---|
| 246 | |
|---|
| 247 | STDERR.puts "Tree = " + tree |
|---|
| 248 | |
|---|
| 249 | memi = [-1,-1] |
|---|
| 250 | leni = [-1,-1] |
|---|
| 251 | |
|---|
| 252 | while tree.index( /\(/ ) |
|---|
| 253 | |
|---|
| 254 | tree = resolve( tree ) |
|---|
| 255 | |
|---|
| 256 | tree.sub!( /\(([0-9]+):(\-?[0-9\.]+),([0-9]+):(\-?[0-9\.]+)\)/, "XXX" ) |
|---|
| 257 | memi[0] = $1.to_i |
|---|
| 258 | leni[0] = $2.to_f * scale |
|---|
| 259 | memi[1] = $3.to_i |
|---|
| 260 | leni[1] = $4.to_f * scale |
|---|
| 261 | |
|---|
| 262 | if leni[0] > 10 || leni[1] > 10 then |
|---|
| 263 | STDERR.puts "" |
|---|
| 264 | STDERR.puts "Please check the scale of branch length!" |
|---|
| 265 | STDERR.puts "The unit of branch lengths must be 'substitution/site'" |
|---|
| 266 | STDERR.puts "If the unit is 'substition' in your tree, please" |
|---|
| 267 | STDERR.puts "use the scale argument," |
|---|
| 268 | STDERR.puts "% newick2mafft scale in > out" |
|---|
| 269 | STDERR.puts "where scale = 1/(alignment length)" |
|---|
| 270 | STDERR.puts "" |
|---|
| 271 | exit 1 |
|---|
| 272 | end |
|---|
| 273 | |
|---|
| 274 | # STDERR.puts "subtree = " + $& |
|---|
| 275 | |
|---|
| 276 | if memi[1] < memi[0] then |
|---|
| 277 | memi.reverse! |
|---|
| 278 | leni.reverse! |
|---|
| 279 | end |
|---|
| 280 | |
|---|
| 281 | tree.sub!( /XXX/, memi[0].to_s ) |
|---|
| 282 | |
|---|
| 283 | # STDERR.puts "Tree = " + tree |
|---|
| 284 | |
|---|
| 285 | mtreefp.printf( "%5d %5d %10.5f %10.5f\n", memi[0], memi[1], leni[0], leni[1] ) |
|---|
| 286 | |
|---|
| 287 | end |
|---|
| 288 | |
|---|
| 289 | |
|---|
| 290 | mtreefp.close |
|---|
| 291 | |
|---|
| 292 | numreg = startpos.length |
|---|
| 293 | |
|---|
| 294 | for i in 0..(numreg-1) |
|---|
| 295 | |
|---|
| 296 | partfp = File.open( "_part", "w" ) |
|---|
| 297 | for j in 0..(tin-1) |
|---|
| 298 | partfp.puts ">" + tname[j] |
|---|
| 299 | if startpos[i] > endpos[i] then |
|---|
| 300 | partfp.puts tseq[j][endpos[i]..startpos[i]].reverse |
|---|
| 301 | else |
|---|
| 302 | partfp.puts tseq[j][startpos[i]..endpos[i]] |
|---|
| 303 | end |
|---|
| 304 | end |
|---|
| 305 | partfp.close |
|---|
| 306 | |
|---|
| 307 | if( realign[i] == 1 ) then |
|---|
| 308 | STDERR.puts "Aligning region #{startpos[i]+1} - #{endpos[i]+1}" |
|---|
| 309 | res = system "#{$MAFFTCOMMAND} #{options[i]} --inputorder --treein _tree _part > _partout" |
|---|
| 310 | if res == false then |
|---|
| 311 | STDERR.puts "\n" |
|---|
| 312 | STDERR.puts "ERROR in aligning region #{startpos[i]+1} - #{endpos[i]+1}" |
|---|
| 313 | STDERR.puts "Please check the option:" |
|---|
| 314 | STDERR.puts "#{options[i]}" |
|---|
| 315 | STDERR.puts "\n" |
|---|
| 316 | exit 1 |
|---|
| 317 | end |
|---|
| 318 | |
|---|
| 319 | else |
|---|
| 320 | STDERR.puts "Copying region #{startpos[i]+1} - #{endpos[i]+1}" |
|---|
| 321 | system "cp _part _partout" |
|---|
| 322 | end |
|---|
| 323 | |
|---|
| 324 | pname = [] |
|---|
| 325 | pseq = [] |
|---|
| 326 | partfp = File.open( "_partout", "r" ) |
|---|
| 327 | pin = readfasta( partfp, pname, pseq ) |
|---|
| 328 | partfp.close |
|---|
| 329 | for j in 0..(tin-1) |
|---|
| 330 | outputseq[j] += pseq[j] |
|---|
| 331 | end |
|---|
| 332 | end |
|---|
| 333 | |
|---|
| 334 | for j in 0..(tin-1) |
|---|
| 335 | puts ">" + tname[j] |
|---|
| 336 | puts outputseq[j] |
|---|
| 337 | end |
|---|
| 338 | |
|---|
| 339 | STDERR.puts "Done." |
|---|
| 340 | |
|---|
| 341 | numdupsites = checkmap.select{|x| x>1}.length |
|---|
| 342 | if numdupsites > 0 then |
|---|
| 343 | STDERR.puts "" |
|---|
| 344 | STDERR.puts "#########################################################" |
|---|
| 345 | STDERR.puts "# Warning: #{numdupsites} sites were duplicatedly selected." |
|---|
| 346 | STDERR.puts "#########################################################" |
|---|
| 347 | STDERR.puts "" |
|---|
| 348 | end |
|---|
| 349 | |
|---|
| 350 | numunselectedsites = checkmap.select{|x| x==0}.length |
|---|
| 351 | if numunselectedsites > 0 then |
|---|
| 352 | STDERR.puts "" |
|---|
| 353 | STDERR.puts "#########################################################" |
|---|
| 354 | STDERR.puts "# Warning: #{numunselectedsites} sites were not selected." |
|---|
| 355 | STDERR.puts "#########################################################" |
|---|
| 356 | STDERR.puts "" |
|---|
| 357 | end |
|---|
| 358 | |
|---|
| 359 | if revwarn == 1 then |
|---|
| 360 | STDERR.puts "" |
|---|
| 361 | STDERR.puts "#########################################################" |
|---|
| 362 | STDERR.puts "# Warning: The order of sites were reversed." |
|---|
| 363 | STDERR.puts "#########################################################" |
|---|
| 364 | STDERR.puts "" |
|---|
| 365 | end |
|---|
| 366 | |
|---|
| 367 | |
|---|
| 368 | STDERR.puts "" |
|---|
| 369 | STDERR.puts " Tree: computed with #{treeoption} --treeout " |
|---|
| 370 | for i in 0..(numreg-1) |
|---|
| 371 | range = sprintf( "%6d - %6d", startpos[i]+1, endpos[i]+1 ) |
|---|
| 372 | if realign[i] == 1 then |
|---|
| 373 | STDERR.puts "#{range}: realigned with #{options[i]} --treein (tree)" |
|---|
| 374 | else |
|---|
| 375 | STDERR.puts "#{range}: preserved" |
|---|
| 376 | end |
|---|
| 377 | end |
|---|
| 378 | STDERR.puts "" |
|---|
| 379 | |
|---|
| 380 | File.delete( "_dum" ) |
|---|
| 381 | File.delete( "_tree" ) |
|---|
| 382 | File.delete( "_part" ) |
|---|
| 383 | File.delete( "_partout" ) |
|---|
| 384 | |
|---|