1 | /* |
---|
2 | * AlignedSequenceLoader.cxx |
---|
3 | * |
---|
4 | * Created on: Feb 15, 2010 |
---|
5 | * Author: Breno Faria |
---|
6 | * |
---|
7 | * Institute of Microbiology (Technical University Munich) |
---|
8 | * http://www.arb-home.de/ |
---|
9 | */ |
---|
10 | |
---|
11 | |
---|
12 | #include "AlignedSequenceLoader.h" |
---|
13 | |
---|
14 | #include <arbdbt.h> |
---|
15 | |
---|
16 | /** |
---|
17 | * Loads the marked sequences aligned from Arb's DB. |
---|
18 | * This loader only considers letters given by the following regular |
---|
19 | * expression: [ACGTUacgtu] |
---|
20 | */ |
---|
21 | AlignedSequenceLoader::AlignedSequenceLoader(GBDATA *gb_main) : |
---|
22 | error(NULp), |
---|
23 | seqs(NULp) |
---|
24 | { |
---|
25 | error = GB_push_transaction(gb_main); |
---|
26 | if (!error) { |
---|
27 | char *al_name = GBT_get_default_alignment(gb_main); |
---|
28 | if (!al_name) { |
---|
29 | error = GB_await_error(); |
---|
30 | } |
---|
31 | else { |
---|
32 | int al_len = GBT_get_alignment_len(gb_main, al_name); |
---|
33 | |
---|
34 | if (al_len <= 0) { |
---|
35 | error = GB_await_error(); |
---|
36 | } |
---|
37 | else { |
---|
38 | seqs = new VecVecType(0); |
---|
39 | MSA_len = al_len; |
---|
40 | |
---|
41 | size_t occurrences[MSA_len]; |
---|
42 | for (size_t i = 0; i < MSA_len; i++) occurrences[i] = 0; |
---|
43 | |
---|
44 | cout << "loading marked species: "; |
---|
45 | flush(cout); |
---|
46 | for (GBDATA *gb_species = GBT_first_marked_species(gb_main); |
---|
47 | gb_species && !error; |
---|
48 | gb_species = GBT_next_marked_species(gb_species)) |
---|
49 | { |
---|
50 | GBDATA *gb_data = GBT_find_sequence(gb_species, al_name); |
---|
51 | if (!gb_data) { |
---|
52 | error = GBS_global_string("species '%s' has no data in selected alignment '%s'", |
---|
53 | GBT_get_name_or_description(gb_species), |
---|
54 | al_name); |
---|
55 | } |
---|
56 | else { // existing alignment data for this species |
---|
57 | cout << GBT_get_name_or_description(gb_species) << " "; |
---|
58 | flush(cout); |
---|
59 | |
---|
60 | string sequence = GB_read_char_pntr(gb_data); |
---|
61 | |
---|
62 | string *seq_as_vec = new string[MSA_len]; |
---|
63 | int k = 0; |
---|
64 | for (string::iterator i = sequence.begin(); i != sequence.end(); ++i) { |
---|
65 | switch (*i) { |
---|
66 | case 'A': |
---|
67 | case 'a': |
---|
68 | seq_as_vec[k] = "A"; |
---|
69 | occurrences[k] += 1; |
---|
70 | break; |
---|
71 | case 'C': |
---|
72 | case 'c': |
---|
73 | seq_as_vec[k] = "C"; |
---|
74 | occurrences[k] += 1; |
---|
75 | break; |
---|
76 | case 'G': |
---|
77 | case 'g': |
---|
78 | seq_as_vec[k] = "G"; |
---|
79 | occurrences[k] += 1; |
---|
80 | break; |
---|
81 | case 'T': |
---|
82 | case 't': |
---|
83 | case 'U': |
---|
84 | case 'u': |
---|
85 | seq_as_vec[k] = "T"; |
---|
86 | occurrences[k] += 1; |
---|
87 | break; |
---|
88 | default: |
---|
89 | seq_as_vec[k] = "-"; |
---|
90 | break; |
---|
91 | } |
---|
92 | k++; |
---|
93 | } |
---|
94 | |
---|
95 | arb_assert((size_t)k == MSA_len); |
---|
96 | vector<string> seq_vector(&seq_as_vec[0], &seq_as_vec[k]); |
---|
97 | delete [] seq_as_vec; |
---|
98 | |
---|
99 | seqs->push_back(seq_vector); |
---|
100 | } |
---|
101 | } |
---|
102 | |
---|
103 | cout << "done. Total number of species: " << seqs->size() << endl; |
---|
104 | flush(cout); |
---|
105 | |
---|
106 | cleanSeqs(occurrences, MSA_len); |
---|
107 | } |
---|
108 | |
---|
109 | free(al_name); |
---|
110 | } |
---|
111 | } |
---|
112 | |
---|
113 | error = GB_end_transaction(gb_main, error); |
---|
114 | } |
---|
115 | |
---|
116 | /** |
---|
117 | * Returns the aligned seqs. |
---|
118 | * |
---|
119 | * @return the aligned seqs. |
---|
120 | */ |
---|
121 | VecVecType* AlignedSequenceLoader::getSequences() { |
---|
122 | arb_assert(!has_error()); |
---|
123 | arb_assert(seqs); |
---|
124 | return seqs; |
---|
125 | } |
---|
126 | |
---|
127 | /** |
---|
128 | * Destructor. |
---|
129 | */ |
---|
130 | AlignedSequenceLoader::~AlignedSequenceLoader() { |
---|
131 | delete seqs; |
---|
132 | } |
---|
133 | |
---|
134 | /** |
---|
135 | * Getter for the MSA_len. |
---|
136 | * |
---|
137 | * @return the MSA length. |
---|
138 | */ |
---|
139 | size_t AlignedSequenceLoader::getMsaLen() { |
---|
140 | arb_assert(!has_error()); |
---|
141 | return MSA_len; |
---|
142 | } |
---|
143 | |
---|
144 | /** |
---|
145 | * Getter for the position map. The position map maps positions in the clean |
---|
146 | * sequence to the original positions in the alignment. |
---|
147 | * |
---|
148 | * @return the position map. |
---|
149 | */ |
---|
150 | const vector<size_t>& AlignedSequenceLoader::getPositionMap() { |
---|
151 | arb_assert(!has_error()); |
---|
152 | return position_map; |
---|
153 | } |
---|
154 | |
---|
155 | /** |
---|
156 | * This method cleans-up the empty positions of the MSA. |
---|
157 | * |
---|
158 | * |
---|
159 | * @param occurrences: a list gathering the number of occurrences of bases at |
---|
160 | * each position of the MSA. |
---|
161 | * @param len: the length of occurrences. |
---|
162 | */ |
---|
163 | void AlignedSequenceLoader::cleanSeqs(const size_t *occurrences, long len) { |
---|
164 | |
---|
165 | cout << "cleaning-up sequences of empty positions... " << endl; |
---|
166 | flush( cout); |
---|
167 | |
---|
168 | size_t num_of_bases = 0; |
---|
169 | for (int i = 0; i < len; i++) { |
---|
170 | if (occurrences[i] != 0) { |
---|
171 | num_of_bases++; |
---|
172 | } |
---|
173 | } |
---|
174 | |
---|
175 | cout << "number of non-empty positions in MSA: " << num_of_bases |
---|
176 | << ". Filtered out " << len - num_of_bases << " positions." << endl; |
---|
177 | |
---|
178 | VecVecType *clean_seqs = new VecVecType(0); |
---|
179 | |
---|
180 | cout << "computing position map..." << endl; |
---|
181 | position_map.resize(num_of_bases, 0); |
---|
182 | |
---|
183 | int j = 0; |
---|
184 | for (int i = 0; i < len; i++) { |
---|
185 | if (occurrences[i] != 0) { |
---|
186 | position_map.at(j) = i; |
---|
187 | j++; |
---|
188 | } |
---|
189 | } |
---|
190 | |
---|
191 | for (VecVecType::iterator seq = seqs->begin(); seq != seqs->end(); ++seq) { |
---|
192 | // for every sequence |
---|
193 | vector<string> sequence(num_of_bases, ""); |
---|
194 | int jj = 0; |
---|
195 | for (int i = 0; i < len; ++i) { |
---|
196 | if (occurrences[i] != 0) { |
---|
197 | sequence.at(jj) = seq->at(i); |
---|
198 | jj++; |
---|
199 | } |
---|
200 | } |
---|
201 | arb_assert(sequence.size() == num_of_bases); |
---|
202 | clean_seqs->push_back(sequence); |
---|
203 | } |
---|
204 | |
---|
205 | delete seqs; // before overwriting it |
---|
206 | seqs = clean_seqs; |
---|
207 | |
---|
208 | cout << "clean-up done." << endl; |
---|
209 | |
---|
210 | } |
---|
211 | |
---|