1 | /* |
---|
2 | * AlignedSequenceLoader.cxx |
---|
3 | * |
---|
4 | * Created on: Feb 15, 2010 |
---|
5 | * Author: Breno Faria |
---|
6 | * |
---|
7 | * Institute of Microbiology (Technical University Munich) |
---|
8 | * http://www.arb-home.de/ |
---|
9 | */ |
---|
10 | |
---|
11 | |
---|
12 | #include "AlignedSequenceLoader.h" |
---|
13 | |
---|
14 | #include <arbdb.h> |
---|
15 | #include <arbdbt.h> |
---|
16 | #include "dbconn.h" |
---|
17 | |
---|
18 | /** |
---|
19 | * Loads the marked sequences aligned from Arb's DB. |
---|
20 | * This loader only considers letters given by the following regular |
---|
21 | * expression: [ACGTUacgtu] |
---|
22 | */ |
---|
23 | AlignedSequenceLoader::AlignedSequenceLoader() { |
---|
24 | |
---|
25 | GBDATA *gb_main = runningDatabase(); |
---|
26 | |
---|
27 | GB_ERROR error = GB_push_transaction(gb_main); |
---|
28 | if (error) { |
---|
29 | cout << "RNACMA-Error: " << error << "\n"; |
---|
30 | exit(EXIT_FAILURE); |
---|
31 | } |
---|
32 | |
---|
33 | seqs = new VecVecType(0); |
---|
34 | |
---|
35 | char *al_name = GBT_get_default_alignment(gb_main); |
---|
36 | MSA_len = GBT_get_alignment_len(gb_main, al_name); |
---|
37 | |
---|
38 | size_t occurrences[MSA_len]; |
---|
39 | for (size_t i = 0; i < MSA_len; i++) |
---|
40 | occurrences[i] = 0; |
---|
41 | |
---|
42 | cout << "loading marked species: "; |
---|
43 | flush( cout); |
---|
44 | for (GBDATA *gb_species = GBT_first_marked_species(gb_main); gb_species; gb_species |
---|
45 | = GBT_next_marked_species(gb_species)) { |
---|
46 | |
---|
47 | GBDATA *gb_ali = GB_entry(gb_species, al_name); |
---|
48 | |
---|
49 | if (gb_ali) { // existing alignment for this species |
---|
50 | GBDATA *gb_data = GB_entry(gb_ali, "data"); |
---|
51 | |
---|
52 | if (gb_data) { |
---|
53 | |
---|
54 | cout << GBT_read_name(gb_species) << " "; |
---|
55 | flush(cout); |
---|
56 | |
---|
57 | string sequence = GB_read_string(gb_data); |
---|
58 | |
---|
59 | string *seq_as_vec = new string[MSA_len]; |
---|
60 | int k = 0; |
---|
61 | for (string::iterator i = sequence.begin(); i != sequence.end(); ++i) { |
---|
62 | switch (*i) { |
---|
63 | case 'A': |
---|
64 | case 'a': |
---|
65 | seq_as_vec[k] = "A"; |
---|
66 | occurrences[k] += 1; |
---|
67 | break; |
---|
68 | case 'C': |
---|
69 | case 'c': |
---|
70 | seq_as_vec[k] = "C"; |
---|
71 | occurrences[k] += 1; |
---|
72 | break; |
---|
73 | case 'G': |
---|
74 | case 'g': |
---|
75 | seq_as_vec[k] = "G"; |
---|
76 | occurrences[k] += 1; |
---|
77 | break; |
---|
78 | case 'T': |
---|
79 | case 't': |
---|
80 | case 'U': |
---|
81 | case 'u': |
---|
82 | seq_as_vec[k] = "T"; |
---|
83 | occurrences[k] += 1; |
---|
84 | break; |
---|
85 | default: |
---|
86 | seq_as_vec[k] = "-"; |
---|
87 | break; |
---|
88 | } |
---|
89 | k++; |
---|
90 | } |
---|
91 | |
---|
92 | assert((size_t)k == MSA_len); |
---|
93 | vector<string> seq_vector(&seq_as_vec[0], &seq_as_vec[k]); |
---|
94 | delete [] seq_as_vec; |
---|
95 | |
---|
96 | seqs->push_back(seq_vector); |
---|
97 | } |
---|
98 | } |
---|
99 | |
---|
100 | } |
---|
101 | |
---|
102 | GB_pop_transaction(gb_main); |
---|
103 | |
---|
104 | cout << "done. Total number of species: " << seqs->size() << endl; |
---|
105 | flush(cout); |
---|
106 | |
---|
107 | cleanSeqs(occurrences, MSA_len); |
---|
108 | } |
---|
109 | |
---|
110 | /** |
---|
111 | * Returns the aligned seqs. |
---|
112 | * |
---|
113 | * @return the aligned seqs. |
---|
114 | */ |
---|
115 | VecVecType* AlignedSequenceLoader::getSequences() { |
---|
116 | return seqs; |
---|
117 | } |
---|
118 | |
---|
119 | /** |
---|
120 | * Destructor. |
---|
121 | */ |
---|
122 | AlignedSequenceLoader::~AlignedSequenceLoader() { |
---|
123 | delete seqs; |
---|
124 | } |
---|
125 | |
---|
126 | /** |
---|
127 | * Getter for the MSA_len. |
---|
128 | * |
---|
129 | * @return the MSA length. |
---|
130 | */ |
---|
131 | size_t AlignedSequenceLoader::getMsaLen() { |
---|
132 | return MSA_len; |
---|
133 | } |
---|
134 | |
---|
135 | /** |
---|
136 | * Getter for the position map. The position map maps positions in the clean |
---|
137 | * sequence to the original positions in the alignment. |
---|
138 | * |
---|
139 | * @return the position map. |
---|
140 | */ |
---|
141 | vector<size_t> * AlignedSequenceLoader::getPositionMap() { |
---|
142 | return position_map; |
---|
143 | } |
---|
144 | |
---|
145 | /** |
---|
146 | * This method cleans-up the empty positions of the MSA. |
---|
147 | * |
---|
148 | * |
---|
149 | * @param occurrences: a list gathering the number of occurrences of bases at |
---|
150 | * each position of the MSA. |
---|
151 | * @param len: the length of occurrences. |
---|
152 | */ |
---|
153 | void AlignedSequenceLoader::cleanSeqs(size_t *occurrences, long len) { |
---|
154 | |
---|
155 | cout << "cleaning-up sequences of empty positions... " << endl; |
---|
156 | flush( cout); |
---|
157 | |
---|
158 | size_t num_of_bases = 0; |
---|
159 | for (int i = 0; i < len; i++) { |
---|
160 | if (occurrences[i] != 0) { |
---|
161 | num_of_bases++; |
---|
162 | } |
---|
163 | } |
---|
164 | |
---|
165 | cout << "number of non-empty positions in MSA: " << num_of_bases |
---|
166 | << ". Filtered out " << len - num_of_bases << " positions." << endl; |
---|
167 | |
---|
168 | VecVecType *clean_seqs = new VecVecType(0); |
---|
169 | |
---|
170 | cout << "computing position map..." << endl; |
---|
171 | position_map = new vector<size_t> (num_of_bases, 0); |
---|
172 | int j = 0; |
---|
173 | for (int i = 0; i < len; i++) { |
---|
174 | if (occurrences[i] != 0) { |
---|
175 | position_map->at(j) = i; |
---|
176 | j++; |
---|
177 | } |
---|
178 | } |
---|
179 | |
---|
180 | for (VecVecType::iterator seq = seqs->begin(); seq != seqs->end(); ++seq) { |
---|
181 | //for every sequence |
---|
182 | vector<string> sequence(num_of_bases, ""); |
---|
183 | int jj = 0; |
---|
184 | for (int i = 0; i < len; ++i) { |
---|
185 | if (occurrences[i] != 0) { |
---|
186 | sequence.at(jj) = seq->at(i); |
---|
187 | jj++; |
---|
188 | } |
---|
189 | } |
---|
190 | assert(sequence.size() == num_of_bases); |
---|
191 | clean_seqs->push_back(sequence); |
---|
192 | } |
---|
193 | |
---|
194 | seqs = clean_seqs; |
---|
195 | |
---|
196 | cout << "clean-up done." << endl; |
---|
197 | |
---|
198 | } |
---|
199 | |
---|