source: branches/ali/GDE/RAxML8/arb_raxml8.sh

Last change on this file was 19234, checked in by westram, 2 years ago
File size: 14.3 KB
Line 
1#!/bin/bash
2set -e
3
4BASES_PER_THREAD=300
5SELF=`basename "$0"`
6
7# set up environment
8if [ -z "$ARB_LIBRARY_PATH" ]; then
9    # special handling for standalone use
10    if [ -z "$LD_LIBRARY_PATH" ]; then
11        LD_LIBRARY_PATH="$ARBHOME/lib"
12    else
13        LD_LIBRARY_PATH="$ARBHOME/lib:$LD_LIBRARY_PATH"
14    fi
15else
16    # normal arb integration; compare to ../../GDEHELP/ARB_GDEmenus.source@RUN_IN_WINDOW
17    LD_LIBRARY_PATH="${ARB_LIBRARY_PATH}"
18fi
19export LD_LIBRARY_PATH
20
21# always wait on exit
22# called at exit of script (by trap) and on error
23# e.g. if ctrl-c is pressed
24wait_and_exit() {
25    # do not recurse
26    trap EXIT
27
28    # kill any backgrounded processes
29    local JOBS=`jobs -r`
30    if [ -n "${JOBS}" ]; then
31        read -p "Press ENTER to terminate child processes"
32        local JOBPIDS=`jobs -p`
33        kill ${JOBPIDS}
34    fi
35    # no extra wait here (caller already waits)
36    exit
37}
38
39on_exit() {
40    wait_and_exit
41}
42trap on_exit EXIT
43
44# return true if argument is file in path executable by user
45can_run() {
46    which "$1" &>/dev/null && test -x `which "$1" &>/dev/null`
47}
48
49# show error in ARB and exit
50report_error() {
51    echo "ARB ERROR: $*"
52    arb_message "$SELF failed with: $*"
53    wait_and_exit
54}
55
56dump_features() {
57    case `uname` in
58        Darwin)
59            sysctl machdep.cpu.features
60            ;;
61        Linux)
62            grep -m1 flags /proc/cpuinfo 2>/dev/null
63            ;;
64    esac
65}
66cpu_has_feature() {
67    dump_features | grep -qi "$1" &>/dev/null
68}
69
70cpu_get_cores() {
71    # honor Torque/PBS num processes (or make sure we follow, if enforced)
72    if [ ! -z "$PBS_NP" ]; then
73        echo "$PBS_NP"
74        return
75    fi
76    # extract physical CPUs from host
77    case `uname` in
78        Darwin)
79            sysctl -n hw.ncpu
80            ;;
81        Linux)
82            grep -c "^processor" /proc/cpuinfo
83            ;;
84    esac
85}
86
87extract_line_suffix() {
88    local LOG="$1"
89    local PREFIX="$2"
90    perl -e "while (<>) { if (/^${PREFIX}\s*/) { print $'; } }" <"$LOG"
91}
92
93extract_likelihood() {
94    local LOG="$1"
95    local PREFIX="$2"
96    local SUFFIX="`extract_line_suffix $LOG $PREFIX`"
97    if [ -z "$SUFFIX" ]; then
98        local FAILED_DETECTION="failed to detect likelyhood"
99        echo $FAILED_DETECTION
100        arb_message "$SELF warning: $FAILED_DETECTION"
101    else
102        echo "$SUFFIX"
103    fi
104}
105
106TREEFILE=arb_export.tree
107
108export_input_tree() {
109    # expect user selected an 'Input tree' in arb and export it to $TREEFILE
110    if [ -z "$INPUTTREE" ]; then
111        report_error "you have to select an 'Input tree'"
112    fi
113
114    arb_export_tree "$INPUTTREE" > "$TREEFILE"
115}
116
117# --------------------------
118#      protocols helpers
119
120bootstrap_and_consenseIfReq() {
121    # run $BOOTSTRAP BS searches
122    $RAXML -b "$SEED" -m "$MODEL" -p "$SEED" -s "$SEQFILE" \
123        -N "$BOOTSTRAPS" \
124        -n BOOTSTRAP
125
126    if [ -n "$MRE" ]; then
127        # compute extended majority rule consensus tree
128        $RAXML -J MRE -m "$MODEL" -z RAxML_bootstrap.BOOTSTRAP -n BOOTSTRAP_CONSENSUS
129    fi
130}
131
132bootstrapAsyncIfRequested_and_wait() {
133    if [ "$BOOTSTRAPS" != "no" ]; then
134        if [ "$TRY_ASYNC" = "1" ]; then
135            if [ $(($THREADS * 2)) -gt $CORES ]; then
136            # wait for raxml (started by caller) to complete,
137            # if we have not enough cores to run bootstrap search at the same time
138                sleep 4 # just cosmetic (raxml output goes 1st)
139                echo "Note: Not enough cores found ($CORES) to run ML search and"
140                echo "      BS in parallel with $THREADS threads. Waiting..."
141                wait
142            fi
143        else
144            # otherwise always sync silently
145            wait
146        fi
147        bootstrap_and_consenseIfReq &
148    fi
149    wait # for all jobs
150}
151
152import_trees() {
153    local TPREFIX="$1"
154    local RUN="$2"
155    local COMMENT="$3"
156
157    local MAINTREE="${TPREFIX}.${RUN}"
158    # imports tree MAINTREE
159    # - with support values (if bootstrapping requested)
160    # - else "as is"
161    # imports MRE tree (if requested)
162
163    if [ "$BOOTSTRAPS" != "no" ]; then
164        # draw bipartition information
165        $RAXML -f b -m "$MODEL" \
166          -t "${TPREFIX}.${RUN}" \
167          -z RAxML_bootstrap.BOOTSTRAP \
168          -n "${RUN}_WITH_SUPPORT"
169
170        MAINTREE="RAxML_bipartitions.${RUN}_WITH_SUPPORT"
171        COMMENT="${COMMENT} BOOTSTRAPS=${BOOTSTRAPS}"
172    fi
173
174    arb_read_tree "${TREENAME}" "${MAINTREE}" "${COMMENT}"
175
176    if [ -n "$MRE" ]; then
177        if [ "$BOOTSTRAPS" != "no" ]; then
178            # otherwise no MRE tree possible
179            arb_read_tree "${TREENAME}_mre" RAxML_MajorityRuleExtendedConsensusTree.BOOTSTRAP_CONSENSUS \
180              "PRG=RAxML8 MRE consensus tree of $BOOTSTRAPS bootstrap searches performed for species in ${TREENAME}"
181        fi
182    fi
183}
184
185# -------------------
186#      protocols
187
188dna_tree_thorough() {
189    # do $REPEATS searches for best ML tree
190    $RAXML -f d -m "$MODEL" -p "$SEED" -s "$SEQFILE"  \
191        -N "$REPEATS" \
192        -n TREE_INFERENCE &
193
194    bootstrapAsyncIfRequested_and_wait
195
196    LIKELIHOOD=`extract_likelihood RAxML_info.TREE_INFERENCE 'Final\s*GAMMA-based\s*Score\s*of\s*best\s*tree'`
197    import_trees RAxML_bestTree TREE_INFERENCE "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=thorough"
198}
199
200dna_tree_quick() {
201    if [ "$BOOTSTRAPS" = "no" ]; then
202        report_error You have to select the number of bootstraps to perform
203    fi
204
205    # run fast bootstraps
206    $RAXML -f a -m "$MODEL" -p "$SEED" -x "$SEED" -s "$SEQFILE" \
207      -N "$BOOTSTRAPS" \
208      -n FAST_BS
209
210    # import
211    LIKELIHOOD=`extract_likelihood RAxML_info.FAST_BS 'Final\s*ML\s*Optimization\s*Likelihood:'`
212    arb_read_tree ${TREENAME} RAxML_bipartitions.FAST_BS "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=quick"
213
214    # create consensus tree
215    if [ -n "$MRE" ]; then
216        $RAXML -J MRE -m "$MODEL" -z RAxML_bootstrap.FAST_BS -n FAST_BS_MAJORITY
217        # import
218        arb_read_tree ${TREENAME}_mre RAxML_MajorityRuleExtendedConsensusTree.FAST_BS_MAJORITY \
219          "PRG=RAxML8 MRE consensus tree of $BOOTSTRAPS rapid-bootstraps performed while calculating ${TREENAME}"
220    fi
221}
222
223dna_tree_add() {
224    export_input_tree
225
226    $RAXML -f d -m "$MODEL" -p "$SEED" -s "$SEQFILE" \
227      -g "$TREEFILE" \
228      -n ADD &
229
230    bootstrapAsyncIfRequested_and_wait
231
232    LIKELIHOOD=`extract_likelihood RAxML_info.ADD 'Final\s*GAMMA-based\s*Score\s*of\s*best\s*tree'`
233    import_trees RAxML_bestTree ADD "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=add INPUTTREE=$INPUTTREE"
234}
235
236dna_tree_optimize() {
237    export_input_tree
238
239    $RAXML -f t -m "$MODEL" -p "$SEED" -s "$SEQFILE" \
240      -N "$REPEATS" \
241      -t "$TREEFILE" \
242      -n OPTIMIZE &
243
244    bootstrapAsyncIfRequested_and_wait
245
246    LIKELIHOOD=`extract_likelihood RAxML_info.OPTIMIZE 'Final\s*GAMMA-based\s*Score\s*of\s*best\s*tree'`
247    import_trees RAxML_bestTree OPTIMIZE "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=optimize INPUTTREE=$INPUTTREE"
248}
249
250dna_tree_calcblen() {
251    export_input_tree
252
253    $RAXML -f e -m "$MODEL" -s "$SEQFILE" \
254      -t "$TREEFILE" \
255      -n CALCBLEN &
256
257    bootstrapAsyncIfRequested_and_wait
258
259    LIKELIHOOD=`extract_likelihood RAxML_info.CALCBLEN 'Final\s*GAMMA\s*likelihood:'`
260    import_trees RAxML_result CALCBLEN "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL LIKELIHOOD=${LIKELIHOOD} PROTOCOL=calcblen INPUTTREE=$INPUTTREE"
261}
262
263dna_tree_bootstrap() {
264    if [ "$BOOTSTRAPS" = "no" ]; then
265        report_error You have to select the number of bootstraps to perform
266    fi
267
268    export_input_tree
269    bootstrapAsyncIfRequested_and_wait
270    import_trees arb_export tree "PRG=RAxML8 FILTER=$FILTER DIST=$MODEL PROTOCOL=bootstrap INPUTTREE=$INPUTTREE"
271}
272
273dna_tree_score() {
274    export_input_tree
275
276    $RAXML -f n -m $MODEL -s "$SEQFILE" \
277      -z "$TREEFILE" \
278      -n SCORE
279
280    RESULT=`extract_likelihood RAxML_info.SCORE 'Tree\s*0\s*Likelihood'`
281    # RESULT contains sth like: -27819.642837 Tree-Length 6.899222
282    LIKELIHOOD=${RESULT/ Tree-Length */}
283    TREELEN=${RESULT/* Tree-Length /}
284
285    arb_write_tree_comment $INPUTTREE "RAxML8-score: FILTER=$FILTER DIST=$MODEL LIKELIHOOD=$LIKELIHOOD TREELEN=$TREELEN"
286}
287
288# --------------
289#      main
290
291MRE=Y
292TREENAME=raxml
293FILTER=unknown
294INPUTTREE=
295
296while [ -n "$1" ]; do
297  case "$1" in
298      -p) # protocol
299          PROTOCOL="$2"
300          shift
301          ;;
302      -m) # subst model
303          MODEL="$2"
304          shift
305          ;;
306      -s) # random seed
307          SEED="$2"
308          shift
309          ;;
310      -b) # bootstraps
311          BOOTSTRAPS="$2"
312          shift
313          ;;
314      -r) # number of tree searches
315          REPEATS="$2"
316          shift
317          ;;
318      -n) # tree name
319          TREENAME="$2"
320          shift
321          ;;
322      -nomre) # don't create mre tree
323          MRE=
324          ;;
325      -nt) # seqtype dna
326          SEQTYPE=N
327          ;;
328      -aa) # seqtype proto
329          SEQTYPE=A
330          ;;
331      -f) # input file
332          FILE="$2"
333          shift
334          ;;
335      -t) # threads
336          THREADS="$2"
337          shift
338          ;;
339      -it) # inputtree
340          INPUTTREE="$2"
341          if [ "$INPUTTREE" = 'tree_?????' ]; then # has to match ../../TEMPLATES/arb_global_defs.h@NO_TREE_SELECTED
342              INPUTTREE=
343          fi
344          shift
345          ;;
346      -fi) # filtername for comment
347          FILTER="$2"
348          shift
349          ;;
350      *)
351          report_error argument not understood: "$1"
352          ;;
353  esac
354  shift
355done
356
357# correct output treename (ensure prefix 'tree_', avoid things like 'tree_tree' etc)
358TREENAME="${TREENAME##tree}"
359TREENAME="${TREENAME#_}"
360TREENAME="${TREENAME#_}"
361TREENAME="tree_${TREENAME}"
362
363# use time as random SEED if empty
364if [ -z "$SEED" ]; then
365    # seconds since 1970
366    SEED=`date +%s`
367fi
368
369# prepare data in tempdir
370DIR="`pwd`" # tempdir now generated by caller
371SEQFILE="dna.phy"
372FULLSEQFILE="${DIR}/${SEQFILE}"
373
374arb_convert_aln --arb-notify -GenBank "$FILE" -phylip "${FULLSEQFILE}" 2>&1 |\
375  grep -v "^WARNING(14): LOCUS" || true # remove spurious warning
376rm "$FILE"
377
378cd "$DIR"
379
380# decide whether async execution of BS and main algo makes sense (ie. runtimes of both are expected similar)
381TRY_ASYNC=0
382case "${PROTOCOL}" in
383    (thorough|optimize)
384        if [ "$BOOTSTRAPS" != "no" ]; then
385            TRY_ASYNC=1
386        fi
387        ;;
388esac
389
390# select RAxML binary
391if cpu_has_feature avx && can_run raxmlHPC8-AVX.PTHREADS; then
392    RAXML=raxmlHPC8-AVX.PTHREADS
393elif cpu_has_feature sse3 && can_run raxmlHPC8-SSE3.PTHREADS; then
394    RAXML=raxmlHPC8-SSE3.PTHREADS
395elif can_run raxmlHPC8-PTHREADS; then
396    RAXML=raxmlHPC8-PTHREADS
397else
398    report_error Could not find working RAxML binary.
399fi
400
401# get some numbers
402CORES=`cpu_get_cores`
403read NSEQS BP_ARB < <(head -n 1 $SEQFILE)
404
405# retrieve number of alignment patterns recognized by RAxML
406$RAXML -T 2 -f j -s "$SEQFILE" -b 123 -N 1 -m "$MODEL" -n PATTERNS
407BP=`extract_line_suffix RAxML_info.PATTERNS "Alignment Patterns:"`
408
409# warn if model is not recommended for given number of sequences
410BAD_PRACTICE="This is not considered good practice.\nPlease refer to the RAxML manual for details."
411if [ "$MODEL" == "GTRGAMMA" -a "$NSEQS" -gt 10000 ]; then
412    arb_message "Using the GTRGAMMA model on more than 10,000 sequences.\n$BAD_PRACTICE"
413fi
414if [ "$MODEL" == "GTRCAT" -a "$NSEQS" -lt 150 ]; then
415    arb_message "Using the GTRCAT model on less than 150 sequences.\n$BAD_PRACTICE"
416fi
417
418# -----------------------------------
419#      threads / cores detection
420
421CORES=$(( $CORES + 1 - 1 ))
422# calculate number of threads (if not passed)
423MAX_THREADS=$(( ( $BP - 1 ) / $BASES_PER_THREAD + 2))
424# +1 is for master thread,
425# another +1 for the first $BASES_PER_THREAD (bash truncates); -1 to avoid extra thread if BP is divisible by BASES_PER_THREAD
426
427if [ "$CORES" -lt 1 ]; then
428    # failed to detect CORES
429    SETENVAR_HINT="set the environment variable PBS_NP to the number of cores available (before you start arb)"
430    if [ -z "$THREADS" ]; then
431        report_error "failed to detect number of cores.\nPlease specify 'CPU thread override' or\n${SETENVAR_HINT}."
432    else
433        CORES=$THREADS
434        if [ "$TRY_ASYNC" = "1" ]; then
435            echo "Warning: failed to detect number of cores (assuming ${CORES} from 'CPU thread override')"
436            echo "Please ${SETENVAR_HINT}."
437        else
438            arb_message "Warning: failed to detect number of cores\n=> parallel bootstrapping disabled\nPlease ${SETENVAR_HINT} to avoid that."
439        fi
440    fi
441else
442    MAX_TH_NOTE="maximum useful thread-count for alignment with ${BP} bp would be ${MAX_THREADS}"
443    if [ -z "$THREADS" ]; then
444        if [ "$CORES" -lt "$MAX_THREADS" ]; then
445            echo "Note: Limiting threads to $CORES available cores (${MAX_TH_NOTE})"
446            THREADS=$CORES
447        else
448            THREADS=$MAX_THREADS
449            if [ "$TRY_ASYNC" = "1" ]; then
450                if [ "$(($THREADS * 2))" -gt "$CORES" ]; then
451                    # split threads between BS and ML search
452                    if [ "$((($THREADS-1) * 2))" -le "$CORES" ]; then
453                        THREADS=$(($THREADS-1))
454                    else
455                        if [ "$((($THREADS-2) * 2))" -le "$CORES" ]; then
456                            THREADS=$(($THREADS-2))
457                        fi
458                    fi
459                fi
460                if [ "$THREADS" -lt "$MAX_THREADS" ]; then
461                    arb_message "Note: reduced threads to $THREADS to allow parallel execution of bootstrapping\nset 'CPU thread override' to ${MAX_THREADS} to avoid that"
462                fi
463            fi
464        fi
465        if [ "$THREADS" -lt 2 ]; then
466            # use at least 2 threads (required by PTHREADS version)
467            THREADS=2
468        fi
469    else
470        echo "Note: Threads forced to ${THREADS} (${MAX_TH_NOTE})"
471    fi
472fi
473
474if [ "$CORES" -lt "$THREADS" ]; then
475    arb_message "Performance-Warning: Using $THREADS threads on $CORES cores"
476fi
477
478RAXML="$RAXML -T $THREADS"
479
480case "${SEQTYPE}.${PROTOCOL}" in
481    N.thorough)
482        time dna_tree_thorough
483        ;;
484    N.quick)
485        time dna_tree_quick
486        ;;
487    N.add)
488        time dna_tree_add
489        ;;
490    N.optimize)
491        time dna_tree_optimize
492        ;;
493    N.calcblen)
494        time dna_tree_calcblen
495        ;;
496    N.bootstrap)
497        time dna_tree_bootstrap
498        ;;
499    N.score)
500        time dna_tree_score
501        ;;
502    *)
503        report_error Unknown protocol "${SEQTYPE}.${PROTOCOL}"
504        ;;
505esac
506
507# @@@ FIXME: cleanup temp dir
508
509exit
Note: See TracBrowser for help on using the repository browser.