source: branches/items/GDE/RAxML8/arb_raxml8.sh

Last change on this file was 15491, checked in by westram, 8 years ago
  • merge [15490] from 'warnings' into 'trunk'
    • argument quoting for arb_raxml8.sh. thx to Elmar.
File size: 14.7 KB
Line 
1#!/bin/bash
2set -e
3
4BASES_PER_THREAD=300
5SELF=`basename "$0"`
6
7# set up environment
8if [ -z "$ARB_LIBRARY_PATH" ]; then
9    # special handling for standalone use
10    if [ -z "$LD_LIBRARY_PATH" ]; then
11        LD_LIBRARY_PATH="$ARBHOME/lib"
12    else
13        LD_LIBRARY_PATH="$ARBHOME/lib:$LD_LIBRARY_PATH"
14    fi
15else
16    # normal arb integration; compare to ../../GDEHELP/ARB_GDEmenus.source@RUN_IN_WINDOW
17    LD_LIBRARY_PATH="${ARB_LIBRARY_PATH}"
18fi
19export LD_LIBRARY_PATH
20
21# always wait on exit
22# called at exit of script (by trap) and on error
23# e.g. if ctrl-c is pressed
24wait_and_exit() {
25    # do not recurse
26    trap EXIT
27
28    # kill any backgrounded processes
29    local JOBS=`jobs -r`
30    if [ -n "${JOBS}" ]; then
31        read -p "Press ENTER to terminate child processes and close window"
32        local JOBPIDS=`jobs -p`
33        kill ${JOBPIDS}
34    else
35        read -p "Press ENTER to close window"
36    fi
37    exit
38}
39
40on_exit() {
41    wait_and_exit
42}
43trap on_exit EXIT
44
45# return true if argument is file in path executable by user
46can_run() {
47    which "$1" &>/dev/null && test -x `which "$1" &>/dev/null`
48}
49
50# show error in ARB and exit
51report_error() {
52    echo "ARB ERROR: $*"
53    arb_message "$SELF failed with: $*"
54    wait_and_exit
55}
56
57# create named temporary directory
58prepare_tmp_dir() {
59    ARB_TMP="$HOME/.arb_tmp"
60    NAME="$1"
61    DATE=`date +%Y-%m-%d--%H-%M-%S`
62    for N in "" ".2" ".3" ".4"; do
63        DIR="$ARB_TMP/$NAME--$DATE$N"
64        if [ -e "$DIR" ]; then
65            continue;
66        else
67            mkdir -p "$DIR"
68            echo "$DIR"
69            return
70        fi
71    done
72    report_error "Unable to create temporary directory"
73}
74
75dump_features() {
76    case `uname` in
77        Darwin)
78            sysctl machdep.cpu.features
79            ;;
80        Linux)
81            grep -m1 flags /proc/cpuinfo 2>/dev/null
82            ;;
83    esac
84}
85cpu_has_feature() {
86    dump_features | grep -qi "$1" &>/dev/null
87}
88
89cpu_get_cores() {
90    # honor Torque/PBS num processes (or make sure we follow, if enforced)
91    if [ ! -z "$PBS_NP" ]; then
92        echo "$PBS_NP"
93        return
94    fi
95    # extract physical CPUs from host
96    case `uname` in
97        Darwin)
98            sysctl -n hw.ncpu
99            ;;
100        Linux)
101            grep -c "^processor" /proc/cpuinfo
102            ;;
103    esac
104}
105
106extract_line_suffix() {
107    local LOG="$1"
108    local PREFIX="$2"
109    perl -e "while (<>) { if (/^${PREFIX}\s*/) { print $'; } }" <"$LOG"
110}
111
112extract_likelihood() {
113    local LOG="$1"
114    local PREFIX="$2"
115    local SUFFIX="`extract_line_suffix $LOG $PREFIX`"
116    if [ -z "$SUFFIX" ]; then
117        local FAILED_DETECTION="failed to detect likelyhood"
118        echo $FAILED_DETECTION
119        arb_message "$SELF warning: $FAILED_DETECTION"
120    else
121        echo "$SUFFIX"
122    fi
123}
124
125TREEFILE=arb_export.tree
126
127export_input_tree() {
128    # expect user selected an 'Input tree' in arb and export it to $TREEFILE
129    if [ -z "$INPUTTREE" ]; then
130        report_error "you have to select an 'Input tree'"
131    fi
132
133    arb_export_tree "$INPUTTREE" > "$TREEFILE"
134}
135
136# --------------------------
137#      protocols helpers
138
139bootstrap_and_consenseIfReq() {
140    # run $BOOTSTRAP BS searches
141    $RAXML -b "$SEED" -m "$MODEL" -p "$SEED" -s "$SEQFILE" \
142        -N "$BOOTSTRAPS" \
143        -n BOOTSTRAP
144
145    if [ -n "$MRE" ]; then
146        # compute extended majority rule consensus tree
147        $RAXML -J MRE -m "$MODEL" -z RAxML_bootstrap.BOOTSTRAP -n BOOTSTRAP_CONSENSUS
148    fi
149}
150
151bootstrapAsyncIfRequested_and_wait() {
152    if [ "$BOOTSTRAPS" != "no" ]; then
153        if [ "$TRY_ASYNC" = "1" ]; then
154            if [ $(($THREADS * 2)) -gt $CORES ]; then
155            # wait for raxml (started by caller) to complete,
156            # if we have not enough cores to run bootstrap search at the same time
157                sleep 4 # just cosmetic (raxml output goes 1st)
158                echo "Note: Not enough cores found ($CORES) to run ML search and"
159                echo "      BS in parallel with $THREADS threads. Waiting..."
160                wait
161            fi
162        else
163            # otherwise always sync silently
164            wait
165        fi
166        bootstrap_and_consenseIfReq &
167    fi
168    wait # for all jobs
169}
170
171import_trees() {
172    local TPREFIX="$1"
173    local RUN="$2"
174    local COMMENT="$3"
175
176    local MAINTREE="${TPREFIX}.${RUN}"
177    # imports tree MAINTREE
178    # - with support values (if bootstrapping requested)
179    # - else "as is"
180    # imports MRE tree (if requested)
181
182    if [ "$BOOTSTRAPS" != "no" ]; then
183        # draw bipartition information
184        $RAXML -f b -m "$MODEL" \
185          -t "${TPREFIX}.${RUN}" \
186          -z RAxML_bootstrap.BOOTSTRAP \
187          -n "${RUN}_WITH_SUPPORT"
188
189        MAINTREE="RAxML_bipartitions.${RUN}_WITH_SUPPORT"
190        COMMENT="${COMMENT} BOOTSTRAPS=${BOOTSTRAPS}"
191    fi
192
193    arb_read_tree "${TREENAME}" "${MAINTREE}" "${COMMENT}"
194
195    if [ -n "$MRE" ]; then
196        if [ "$BOOTSTRAPS" != "no" ]; then
197            # otherwise no MRE tree possible
198            arb_read_tree "${TREENAME}_mre" RAxML_MajorityRuleExtendedConsensusTree.BOOTSTRAP_CONSENSUS \
199              "PRG=RAxML8 MRE consensus tree of $BOOTSTRAPS bootstrap searches performed for species in ${TREENAME}"
200        fi
201    fi
202}
203
204# -------------------
205#      protocols
206
207dna_tree_thorough() {
208    # do $REPEATS searches for best ML tree
209    $RAXML -f d -m "$MODEL" -p "$SEED" -s "$SEQFILE"  \
210        -N "$REPEATS" \
211        -n TREE_INFERENCE &
212
213    bootstrapAsyncIfRequested_and_wait
214
215    LIKELIHOOD=`extract_likelihood RAxML_info.TREE_INFERENCE 'Final\s*GAMMA-based\s*Score\s*of\s*best\s*tree'`
216    import_trees RAxML_bestTree TREE_INFERENCE "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=thorough"
217}
218
219dna_tree_quick() {
220    if [ "$BOOTSTRAPS" = "no" ]; then
221        report_error You have to select the number of bootstraps to perform
222    fi
223
224    # run fast bootstraps
225    $RAXML -f a -m "$MODEL" -p "$SEED" -x "$SEED" -s "$SEQFILE" \
226      -N "$BOOTSTRAPS" \
227      -n FAST_BS
228
229    # import
230    LIKELIHOOD=`extract_likelihood RAxML_info.FAST_BS 'Final\s*ML\s*Optimization\s*Likelihood:'`
231    arb_read_tree ${TREENAME} RAxML_bipartitions.FAST_BS "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=quick"
232
233    # create consensus tree
234    if [ -n "$MRE" ]; then
235        $RAXML -J MRE -m "$MODEL" -z RAxML_bootstrap.FAST_BS -n FAST_BS_MAJORITY
236        # import
237        arb_read_tree ${TREENAME}_mre RAxML_MajorityRuleExtendedConsensusTree.FAST_BS_MAJORITY \
238          "PRG=RAxML8 MRE consensus tree of $BOOTSTRAPS rapid-bootstraps performed while calculating ${TREENAME}"
239    fi
240}   
241
242dna_tree_add() {
243    export_input_tree
244
245    $RAXML -f d -m "$MODEL" -p "$SEED" -s "$SEQFILE" \
246      -g "$TREEFILE" \
247      -n ADD &
248
249    bootstrapAsyncIfRequested_and_wait
250
251    LIKELIHOOD=`extract_likelihood RAxML_info.ADD 'Final\s*GAMMA-based\s*Score\s*of\s*best\s*tree'`
252    import_trees RAxML_bestTree ADD "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=add INPUTTREE=$INPUTTREE"
253}
254
255dna_tree_optimize() {
256    export_input_tree
257
258    $RAXML -f t -m "$MODEL" -p "$SEED" -s "$SEQFILE" \
259      -N "$REPEATS" \
260      -t "$TREEFILE" \
261      -n OPTIMIZE &
262
263    bootstrapAsyncIfRequested_and_wait
264
265    LIKELIHOOD=`extract_likelihood RAxML_info.OPTIMIZE 'Final\s*GAMMA-based\s*Score\s*of\s*best\s*tree'`
266    import_trees RAxML_bestTree OPTIMIZE "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=optimize INPUTTREE=$INPUTTREE"
267}
268
269dna_tree_calcblen() {
270    export_input_tree
271
272    $RAXML -f e -m "$MODEL" -s "$SEQFILE" \
273      -t "$TREEFILE" \
274      -n CALCBLEN &
275
276    bootstrapAsyncIfRequested_and_wait
277
278    LIKELIHOOD=`extract_likelihood RAxML_info.CALCBLEN 'Final\s*GAMMA\s*likelihood:'`
279    import_trees RAxML_result CALCBLEN "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=calcblen INPUTTREE=$INPUTTREE"
280}
281
282dna_tree_bootstrap() {
283    if [ "$BOOTSTRAPS" = "no" ]; then
284        report_error You have to select the number of bootstraps to perform
285    fi
286
287    export_input_tree
288    bootstrapAsyncIfRequested_and_wait
289    import_trees arb_export tree "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL PROTOCOL=bootstrap INPUTTREE=$INPUTTREE"
290}
291
292dna_tree_score() {
293    export_input_tree
294
295    $RAXML -f n -m $MODEL -s "$SEQFILE" \
296      -z "$TREEFILE" \
297      -n SCORE
298
299    RESULT=`extract_likelihood RAxML_info.SCORE 'Tree\s*0\s*Likelihood'`
300    # RESULT contains sth like: -27819.642837 Tree-Length 6.899222
301    LIKELIHOOD=${RESULT/ Tree-Length */}
302    TREELEN=${RESULT/* Tree-Length /}
303
304    arb_write_tree_comment $INPUTTREE "RAxML8-score: FILTER=$FILTER DIST=$MODEL LIKELIHOOD=$LIKELIHOOD TREELEN=$TREELEN"
305}
306
307# --------------
308#      main     
309
310MRE=Y
311TREENAME=raxml
312FILTER=unknown
313INPUTTREE=
314
315while [ -n "$1" ]; do 
316  case "$1" in
317      -p) # protocol
318          PROTOCOL="$2"
319          shift
320          ;;
321      -m) # subst model
322          MODEL="$2"
323          shift
324          ;;
325      -s) # random seed
326          SEED="$2"
327          shift
328          ;;
329      -b) # bootstraps
330          BOOTSTRAPS="$2"
331          shift
332          ;;
333      -r) # number of tree searches
334          REPEATS="$2"
335          shift
336          ;;
337      -n) # tree name
338          TREENAME="$2"
339          shift
340          ;;
341      -nomre) # don't create mre tree
342          MRE=
343          ;;
344      -nt) # seqtype dna
345          SEQTYPE=N
346          ;;
347      -aa) # seqtype proto
348          SEQTYPE=A
349          ;;
350      -f) # input file
351          FILE="$2"
352          shift
353          ;;
354      -t) # threads
355          THREADS="$2"
356          shift
357          ;;
358      -it) # inputtree
359          INPUTTREE="$2"
360          if [ "$INPUTTREE" = 'tree_?????' ]; then # has to match ../../TEMPLATES/arb_global_defs.h@NO_TREE_SELECTED
361              INPUTTREE=
362          fi
363          shift
364          ;;
365      -fi) # filtername for comment
366          FILTER="$2"
367          shift
368          ;;
369      *)
370          report_error argument not understood: "$1"
371          ;;
372  esac
373  shift
374done
375
376# correct output treename (ensure prefix 'tree_', avoid things like 'tree_tree' etc)
377TREENAME="${TREENAME##tree}"
378TREENAME="${TREENAME#_}"
379TREENAME="${TREENAME#_}"
380TREENAME="tree_${TREENAME}"
381
382# use time as random SEED if empty
383if [ -z "$SEED" ]; then
384    # seconds since 1970
385    SEED=`date +%s`
386fi
387
388# prepare data in tempdir
389DIR="`prepare_tmp_dir raxml`"
390SEQFILE="dna.phy"
391FULLSEQFILE="${DIR}/${SEQFILE}"
392
393arb_convert_aln --arb-notify -GenBank "$FILE" -phylip "${FULLSEQFILE}" 2>&1 |\
394  grep -v "^WARNING(14): LOCUS" || true # remove spurious warning
395rm "$FILE"
396
397cd "$DIR"
398
399# decide whether async execution of BS and main algo makes sense (ie. runtimes of both are expected similar)
400TRY_ASYNC=0
401case "${PROTOCOL}" in
402    (thorough|optimize)
403        if [ "$BOOTSTRAPS" != "no" ]; then
404            TRY_ASYNC=1
405        fi
406        ;;
407esac
408
409# select RAxML binary
410if cpu_has_feature avx && can_run raxmlHPC8-AVX.PTHREADS; then
411    RAXML=raxmlHPC8-AVX.PTHREADS
412elif cpu_has_feature sse3 && can_run raxmlHPC8-SSE3.PTHREADS; then
413    RAXML=raxmlHPC8-SSE3.PTHREADS
414elif can_run raxmlHPC8-PTHREADS; then
415    RAXML=raxmlHPC8-PTHREADS
416else
417    report_error Could not find working RAxML binary.
418fi
419
420# get some numbers
421CORES=`cpu_get_cores`
422read NSEQS BP_ARB < <(head -n 1 $SEQFILE)
423
424# retrieve number of alignment patterns recognized by RAxML
425$RAXML -T 2 -f j -s "$SEQFILE" -b 123 -N 1 -m "$MODEL" -n PATTERNS
426BP=`extract_line_suffix RAxML_info.PATTERNS "Alignment Patterns:"`
427
428# warn if model is not recommended for given number of sequences
429BAD_PRACTICE="This is not considered good practice.\nPlease refer to the RAxML manual for details."
430if [ "$MODEL" == "GTRGAMMA" -a "$NSEQS" -gt 10000 ]; then
431    arb_message "Using the GTRGAMMA model on more than 10,000 sequences.\n$BAD_PRACTICE"
432fi
433if [ "$MODEL" == "GTRCAT" -a "$NSEQS" -lt 150 ]; then
434    arb_message "Using the GTRCAT model on less than 150 sequences.\n$BAD_PRACTICE"
435fi
436
437# -----------------------------------
438#      threads / cores detection     
439
440CORES=$(( $CORES + 1 - 1 ))
441# calculate number of threads (if not passed)
442MAX_THREADS=$(( ( $BP - 1 ) / $BASES_PER_THREAD + 2))
443# +1 is for master thread,
444# another +1 for the first $BASES_PER_THREAD (bash truncates); -1 to avoid extra thread if BP is divisible by BASES_PER_THREAD
445
446if [ "$CORES" -lt 1 ]; then
447    # failed to detect CORES
448    SETENVAR_HINT="set the environment variable PBS_NP to the number of cores available (before you start arb)"
449    if [ -z "$THREADS" ]; then
450        report_error "failed to detect number of cores.\nPlease specify 'CPU thread override' or\n${SETENVAR_HINT}."
451    else
452        CORES=$THREADS
453        if [ "$TRY_ASYNC" = "1" ]; then
454            echo "Warning: failed to detect number of cores (assuming ${CORES} from 'CPU thread override')"
455            echo "Please ${SETENVAR_HINT}."
456        else
457            arb_message "Warning: failed to detect number of cores\n=> parallel bootstrapping disabled\nPlease ${SETENVAR_HINT} to avoid that."
458        fi
459    fi
460else
461    MAX_TH_NOTE="maximum useful thread-count for alignment with ${BP} bp would be ${MAX_THREADS}"
462    if [ -z "$THREADS" ]; then
463        if [ "$CORES" -lt "$MAX_THREADS" ]; then
464            echo "Note: Limiting threads to $CORES available cores (${MAX_TH_NOTE})"
465            THREADS=$CORES
466        else
467            THREADS=$MAX_THREADS
468            if [ "$TRY_ASYNC" = "1" ]; then
469                if [ "$(($THREADS * 2))" -gt "$CORES" ]; then
470                    # split threads between BS and ML search
471                    if [ "$((($THREADS-1) * 2))" -le "$CORES" ]; then
472                        THREADS=$(($THREADS-1))
473                    else
474                        if [ "$((($THREADS-2) * 2))" -le "$CORES" ]; then
475                            THREADS=$(($THREADS-2))
476                        fi
477                    fi
478                fi
479                if [ "$THREADS" -lt "$MAX_THREADS" ]; then
480                    arb_message "Note: reduced threads to $THREADS to allow parallel execution of bootstrapping\nset 'CPU thread override' to ${MAX_THREADS} to avoid that"
481                fi
482            fi
483        fi
484        if [ "$THREADS" -lt 2 ]; then
485            # use at least 2 threads (required by PTHREADS version)
486            THREADS=2
487        fi
488    else
489        echo "Note: Threads forced to ${THREADS} (${MAX_TH_NOTE})"
490    fi
491fi
492
493if [ "$CORES" -lt "$THREADS" ]; then
494    arb_message "Performance-Warning: Using $THREADS threads on $CORES cores"
495fi
496
497RAXML="$RAXML -T $THREADS"
498
499case "${SEQTYPE}.${PROTOCOL}" in
500    N.thorough)
501        time dna_tree_thorough
502        ;;
503    N.quick)
504        time dna_tree_quick
505        ;;
506    N.add)
507        time dna_tree_add
508        ;;
509    N.optimize)
510        time dna_tree_optimize
511        ;;
512    N.calcblen)
513        time dna_tree_calcblen
514        ;;
515    N.bootstrap)
516        time dna_tree_bootstrap
517        ;;
518    N.score)
519        time dna_tree_score
520        ;;
521    *)
522        report_error Unknown protocol "${SEQTYPE}.${PROTOCOL}"
523        ;;
524esac
525
526# @@@ FIXME: cleanup temp dir
527
528exit
Note: See TracBrowser for help on using the repository browser.