1 | #ifndef PROBE_H |
---|
2 | #define PROBE_H |
---|
3 | |
---|
4 | #ifndef __LIST__ |
---|
5 | #include <list> |
---|
6 | #endif |
---|
7 | #ifndef __SET__ |
---|
8 | #include <set> |
---|
9 | #endif |
---|
10 | |
---|
11 | #ifndef ARBDB_H |
---|
12 | #include <arbdb.h> |
---|
13 | #endif |
---|
14 | #ifndef PT_COM_H |
---|
15 | #include <PT_com.h> |
---|
16 | #endif |
---|
17 | #ifndef AISC_GEN_SERVER_INCLUDED |
---|
18 | #include <PT_server.h> |
---|
19 | #endif |
---|
20 | #ifndef PT_TOOLS_H |
---|
21 | #include "PT_tools.h" |
---|
22 | #endif |
---|
23 | #ifndef CACHE_H |
---|
24 | #include <cache.h> |
---|
25 | #endif |
---|
26 | |
---|
27 | #define PT_SERVER_MAGIC 0x32108765 // pt server identifier |
---|
28 | #define PT_SERVER_VERSION 3 // version of pt server database (no versioning till 2009/05/13) |
---|
29 | |
---|
30 | #if defined(DEBUG) |
---|
31 | // # define PTM_DEBUG_NODES |
---|
32 | # define PTM_DEBUG_STAGE_ASSERTIONS |
---|
33 | // # define PTM_DEBUG_VALIDATE_CHAINS |
---|
34 | #endif // DEBUG |
---|
35 | |
---|
36 | #define CALCULATE_STATS_ON_QUERY |
---|
37 | |
---|
38 | #if defined(PTM_DEBUG_STAGE_ASSERTIONS) |
---|
39 | #define pt_assert_stage(s) pt_assert(psg.get_stage() == (s)) |
---|
40 | #else // !defined(PTM_DEBUG_STAGE_ASSERTIONS) |
---|
41 | #define pt_assert_stage(s) |
---|
42 | #endif |
---|
43 | |
---|
44 | #if defined(PTM_DEBUG_VALIDATE_CHAINS) |
---|
45 | #define pt_assert_valid_chain_stage1(node) pt_assert(PT_chain_has_valid_entries<ChainIteratorStage1>(node)) |
---|
46 | #else // !defined(PTM_DEBUG_VALIDATE_CHAINS) |
---|
47 | #define pt_assert_valid_chain_stage1(node) |
---|
48 | #endif |
---|
49 | |
---|
50 | typedef unsigned long ULONG; |
---|
51 | typedef unsigned int UINT; |
---|
52 | typedef unsigned char uchar; |
---|
53 | |
---|
54 | #define PT_MAX_PARTITION_DEPTH 4 |
---|
55 | |
---|
56 | #define PT_POS_TREE_HEIGHT 20 |
---|
57 | #define PT_MIN_TREE_HEIGHT PT_MAX_PARTITION_DEPTH |
---|
58 | |
---|
59 | #define MIN_PROBE_LENGTH 2 |
---|
60 | |
---|
61 | enum PT_MATCH_TYPE { |
---|
62 | PT_MATCH_TYPE_INTEGER = 0, |
---|
63 | PT_MATCH_TYPE_WEIGHTED_PLUS_POS = 1, |
---|
64 | PT_MATCH_TYPE_WEIGHTED = -1 |
---|
65 | }; |
---|
66 | |
---|
67 | |
---|
68 | |
---|
69 | #define MATCHANSWER 50 // private msg type: match result answer |
---|
70 | #define CREATEANSWER 51 // private msg type: create result answer |
---|
71 | #define FINDANSWER 52 // private msg type: find result answer |
---|
72 | |
---|
73 | extern int gene_flag; // if 'gene_flag' == 1 -> we are a gene pt server |
---|
74 | struct Hs_struct; |
---|
75 | |
---|
76 | enum type_types { |
---|
77 | t_int = 1, |
---|
78 | t_string = 0, |
---|
79 | t_float = 2 |
---|
80 | }; |
---|
81 | |
---|
82 | enum PT_base { |
---|
83 | PT_QU = 0, |
---|
84 | PT_N = 1, |
---|
85 | PT_A, |
---|
86 | PT_C, |
---|
87 | PT_G, |
---|
88 | PT_T, |
---|
89 | PT_BASES, |
---|
90 | PT_B_UNDEF, |
---|
91 | }; |
---|
92 | |
---|
93 | CONSTEXPR_INLINE bool is_std_base (char b) { return b >= PT_A && b <= PT_T; } |
---|
94 | CONSTEXPR_INLINE bool is_std_base_or_N(char b) { return b >= PT_N && b <= PT_T; } |
---|
95 | CONSTEXPR_INLINE bool is_ambig_base (char b) { return b == PT_QU || b == PT_N; } |
---|
96 | CONSTEXPR_INLINE bool is_valid_base (char b) { return b >= PT_QU && b < PT_BASES; } |
---|
97 | |
---|
98 | CONSTEXPR_INLINE char base_2_readable(char base) { |
---|
99 | return base<PT_BASES ? ".NACGU"[safeCharIndex(base)] : base; |
---|
100 | } |
---|
101 | |
---|
102 | inline char *probe_2_readable(char *id_string, int len) { |
---|
103 | //! translate a string containing PT_base into readable characters. |
---|
104 | // caution if 'id_string' contains PT_QU ( == zero == EOS). |
---|
105 | // (see also: probe_compress_sequence) |
---|
106 | for (int i = 0; i<len; ++i) { |
---|
107 | id_string[i] = base_2_readable(id_string[i]); |
---|
108 | } |
---|
109 | return id_string; |
---|
110 | } |
---|
111 | |
---|
112 | inline void reverse_probe(char *seq, int len) { |
---|
113 | int i = 0; |
---|
114 | int j = len-1; |
---|
115 | |
---|
116 | while (i<j) std::swap(seq[i++], seq[j--]); |
---|
117 | } |
---|
118 | |
---|
119 | struct POS_TREE1; |
---|
120 | struct POS_TREE2; |
---|
121 | |
---|
122 | enum Stage { STAGE1, STAGE2 }; |
---|
123 | |
---|
124 | // --------------------- |
---|
125 | // Probe search |
---|
126 | |
---|
127 | class probe_input_data : virtual Noncopyable { // every taxa's own data |
---|
128 | int size; // @@@ misleading (contains 1 if no bases in sequence) |
---|
129 | GBDATA *gb_species; |
---|
130 | bool group; // probe_design: whether species is in group |
---|
131 | |
---|
132 | typedef SmartArrayPtr(int) SmartIntPtr; |
---|
133 | |
---|
134 | mutable cache::CacheHandle<SmartCharPtr> seq; |
---|
135 | mutable cache::CacheHandle<SmartIntPtr> rel2abs; |
---|
136 | |
---|
137 | static cache::Cache<SmartCharPtr> seq_cache; |
---|
138 | static cache::Cache<SmartIntPtr> rel2abs_cache; |
---|
139 | |
---|
140 | SmartCharPtr loadSeq() const { |
---|
141 | GB_transaction ta(gb_species); |
---|
142 | GBDATA *gb_compr = GB_entry(gb_species, "compr"); |
---|
143 | return SmartCharPtr(GB_read_bytes(gb_compr)); |
---|
144 | } |
---|
145 | |
---|
146 | public: |
---|
147 | |
---|
148 | probe_input_data() |
---|
149 | : size(0), |
---|
150 | gb_species(NULp), |
---|
151 | group(false) |
---|
152 | {} |
---|
153 | ~probe_input_data() { |
---|
154 | seq.release(seq_cache); |
---|
155 | rel2abs.release(rel2abs_cache); |
---|
156 | } |
---|
157 | |
---|
158 | static void set_cache_sizes(size_t seq_cache_size, size_t abs_cache_size) { |
---|
159 | seq_cache.resize(seq_cache_size); |
---|
160 | rel2abs_cache.resize(abs_cache_size); |
---|
161 | } |
---|
162 | |
---|
163 | GB_ERROR init(GBDATA *gb_species_); |
---|
164 | |
---|
165 | SmartCharPtr get_dataPtr() const { |
---|
166 | if (!seq.is_cached()) seq.assign(loadSeq(), seq_cache); |
---|
167 | return seq.access(seq_cache); |
---|
168 | } |
---|
169 | |
---|
170 | const char *get_shortname() const { |
---|
171 | GB_transaction ta(gb_species); |
---|
172 | |
---|
173 | GBDATA *gb_name = GB_entry(gb_species, "name"); |
---|
174 | pt_assert(gb_name); |
---|
175 | return GB_read_char_pntr(gb_name); |
---|
176 | } |
---|
177 | const char *get_fullname() const { |
---|
178 | GB_transaction ta(gb_species); |
---|
179 | |
---|
180 | GBDATA *gb_full = GB_entry(gb_species, "full_name"); |
---|
181 | return gb_full ? GB_read_char_pntr(gb_full) : ""; |
---|
182 | } |
---|
183 | |
---|
184 | const char *get_acc() const { |
---|
185 | GB_transaction ta(gb_species); |
---|
186 | |
---|
187 | GBDATA *gb_acc = GB_entry(gb_species, "acc"); |
---|
188 | pt_assert(gb_acc); |
---|
189 | return gb_acc ? GB_read_char_pntr(gb_acc) : NULp; |
---|
190 | } |
---|
191 | int get_start() const { |
---|
192 | GB_transaction ta(gb_species); |
---|
193 | |
---|
194 | GBDATA *gb_start = GB_entry(gb_species, "start"); |
---|
195 | return gb_start ? GB_read_int(gb_start) : 0; |
---|
196 | } |
---|
197 | int get_stop() const { |
---|
198 | GB_transaction ta(gb_species); |
---|
199 | |
---|
200 | GBDATA *gb_stop = GB_entry(gb_species, "stop"); |
---|
201 | return gb_stop ? GB_read_int(gb_stop) : 0; |
---|
202 | } |
---|
203 | |
---|
204 | long get_checksum() const { // @@@ change return-type -> uint32_t |
---|
205 | GB_transaction ta(gb_species); |
---|
206 | GBDATA *gb_cs = GB_entry(gb_species, "cs"); |
---|
207 | pt_assert(gb_cs); |
---|
208 | uint32_t csum = uint32_t(GB_read_int(gb_cs)); |
---|
209 | return csum; |
---|
210 | } |
---|
211 | int get_size() const { return size; } |
---|
212 | |
---|
213 | bool valid_rel_pos(int rel_pos) const { // returns true if rel_pos is inside sequence |
---|
214 | return rel_pos >= 0 && rel_pos<get_size(); |
---|
215 | } |
---|
216 | |
---|
217 | bool inside_group() const { return group; } |
---|
218 | bool outside_group() const { return !group; } |
---|
219 | |
---|
220 | void set_group_state(bool isGroupMember) { group = isGroupMember; } |
---|
221 | |
---|
222 | long get_geneabspos() const { |
---|
223 | pt_assert(gene_flag); // only legal in gene-ptserver |
---|
224 | GBDATA *gb_pos = GB_entry(gb_species, "abspos"); |
---|
225 | if (gb_pos) return GB_read_int(gb_pos); |
---|
226 | return -1; |
---|
227 | } |
---|
228 | |
---|
229 | void preload_rel2abs() const { |
---|
230 | if (!rel2abs.is_cached()) { |
---|
231 | GB_transaction ta(gb_species); |
---|
232 | |
---|
233 | GBDATA *gb_baseoff = GB_entry(gb_species, "baseoff"); |
---|
234 | pt_assert(gb_baseoff); |
---|
235 | |
---|
236 | const GB_CUINT4 *baseoff = GB_read_ints_pntr(gb_baseoff); |
---|
237 | |
---|
238 | int *r2a = new int[size]; |
---|
239 | |
---|
240 | int abs = 0; |
---|
241 | for (int rel = 0; rel<size; ++rel) { |
---|
242 | abs += baseoff[rel]; |
---|
243 | r2a[rel] = abs; |
---|
244 | } |
---|
245 | |
---|
246 | rel2abs.assign(r2a, rel2abs_cache); |
---|
247 | } |
---|
248 | } |
---|
249 | size_t get_abspos(size_t rel_pos) const { |
---|
250 | pt_assert(rel2abs.is_cached()); // you need to call preload_rel2abs |
---|
251 | return (&*rel2abs.access(rel2abs_cache))[rel_pos]; // @@@ brute-forced |
---|
252 | } |
---|
253 | |
---|
254 | size_t calc_relpos(int abs_pos) const { // expensive |
---|
255 | preload_rel2abs(); |
---|
256 | SmartIntPtr rel2abs_ptr = rel2abs.access(rel2abs_cache); |
---|
257 | const int *r2a = &*rel2abs_ptr; |
---|
258 | |
---|
259 | int l = 0; |
---|
260 | int h = get_size()-1; |
---|
261 | |
---|
262 | if (r2a[l] == abs_pos) return l; |
---|
263 | if (r2a[h] == abs_pos) return h; |
---|
264 | |
---|
265 | while (l<h) { |
---|
266 | int m = (l+h)/2; |
---|
267 | if (r2a[m]<abs_pos) { |
---|
268 | l = m; |
---|
269 | } |
---|
270 | else if (r2a[m]>abs_pos) { |
---|
271 | h = m; |
---|
272 | } |
---|
273 | else { |
---|
274 | return m; |
---|
275 | } |
---|
276 | } |
---|
277 | return l; |
---|
278 | } |
---|
279 | }; |
---|
280 | |
---|
281 | struct probe_statistic_struct { |
---|
282 | // common |
---|
283 | int cut_offs; // statistic of chains |
---|
284 | int single_node; |
---|
285 | int short_node; |
---|
286 | int long_node; |
---|
287 | int longs; |
---|
288 | int shorts; |
---|
289 | int shorts2; |
---|
290 | int chars; |
---|
291 | |
---|
292 | #ifdef ARB_64 |
---|
293 | // 64bit specials |
---|
294 | int int_node; |
---|
295 | int ints; |
---|
296 | int ints2; |
---|
297 | long maxdiff; |
---|
298 | #endif |
---|
299 | |
---|
300 | void setup(); |
---|
301 | }; |
---|
302 | |
---|
303 | class BI_ecoli_ref; |
---|
304 | |
---|
305 | class MostUsedPos : virtual Noncopyable { |
---|
306 | int pos; |
---|
307 | int used; |
---|
308 | |
---|
309 | MostUsedPos *next; |
---|
310 | |
---|
311 | void swapWith(MostUsedPos *other) { |
---|
312 | std::swap(pos, other->pos); |
---|
313 | std::swap(used, other->used); |
---|
314 | } |
---|
315 | |
---|
316 | public: |
---|
317 | MostUsedPos() : pos(0), used(0), next(NULp) { } |
---|
318 | MostUsedPos(int p) : pos(p), used(1), next(NULp) { } |
---|
319 | ~MostUsedPos() { delete next; } |
---|
320 | |
---|
321 | void clear() { |
---|
322 | pos = 0; |
---|
323 | used = 0; |
---|
324 | delete next; |
---|
325 | next = NULp; |
---|
326 | } |
---|
327 | |
---|
328 | void announce(int p) { |
---|
329 | if (p == pos) used++; |
---|
330 | else { |
---|
331 | if (next) next->announce(p); |
---|
332 | else next = new MostUsedPos(p); |
---|
333 | if (next->used>used) swapWith(next); |
---|
334 | } |
---|
335 | } |
---|
336 | |
---|
337 | |
---|
338 | int get_most_used() const { return pos; } |
---|
339 | }; |
---|
340 | |
---|
341 | class probe_struct_global { |
---|
342 | Stage stage; |
---|
343 | |
---|
344 | union { |
---|
345 | POS_TREE1 *p1; |
---|
346 | POS_TREE2 *p2; |
---|
347 | } pt; |
---|
348 | |
---|
349 | public: |
---|
350 | GB_shell *gb_shell; |
---|
351 | GBDATA *gb_main; // ARBDB interface |
---|
352 | char *alignment_name; |
---|
353 | GB_HASH *namehash; // name to int |
---|
354 | |
---|
355 | int data_count; |
---|
356 | struct probe_input_data *data; // the internal database |
---|
357 | |
---|
358 | char *ecoli; // the ecoli sequenz |
---|
359 | BI_ecoli_ref *bi_ecoli; |
---|
360 | |
---|
361 | int max_size; // maximum sequence len |
---|
362 | long char_count; // number of all 'acgtuACGTU' |
---|
363 | |
---|
364 | int reversed; // tell the matcher whether probe is reversed |
---|
365 | |
---|
366 | double *pos_to_weight; // position to weight |
---|
367 | |
---|
368 | MostUsedPos abs_pos; |
---|
369 | |
---|
370 | int sort_by; |
---|
371 | |
---|
372 | char *main_probe; |
---|
373 | |
---|
374 | char *server_name; // name of this server |
---|
375 | aisc_com *link; |
---|
376 | T_PT_MAIN main; |
---|
377 | Hs_struct *com_so; // the communication socket |
---|
378 | |
---|
379 | probe_statistic_struct stat; |
---|
380 | |
---|
381 | bool big_db; // STAGE2 only (true -> uses 8 bit pointers) |
---|
382 | |
---|
383 | void setup(); |
---|
384 | void cleanup(); |
---|
385 | |
---|
386 | void enter_stage(Stage stage_); |
---|
387 | Stage get_stage() const { return stage; } |
---|
388 | |
---|
389 | POS_TREE1*& TREE_ROOT1() { pt_assert(stage == STAGE1); return pt.p1; } |
---|
390 | POS_TREE2*& TREE_ROOT2() { pt_assert(stage == STAGE2); return pt.p2; } |
---|
391 | }; |
---|
392 | |
---|
393 | extern probe_struct_global psg; |
---|
394 | |
---|
395 | class gene_struct { |
---|
396 | char *gene_name; |
---|
397 | const char *arb_species_name; // pointers into 'gene_name' |
---|
398 | const char *arb_gene_name; |
---|
399 | |
---|
400 | void init(const char *gene_name_, const char *arb_species_name_, const char *arb_gene_name_) { |
---|
401 | int gene_name_len = strlen(gene_name_); |
---|
402 | int arb_species_name_len = strlen(arb_species_name_); |
---|
403 | int arb_gene_name_len = strlen(arb_gene_name_); |
---|
404 | |
---|
405 | int fulllen = gene_name_len+1+arb_species_name_len+1+arb_gene_name_len+1; |
---|
406 | gene_name = new char[fulllen]; |
---|
407 | strcpy(gene_name, gene_name_); |
---|
408 | arb_species_name = gene_name+(gene_name_len+1); |
---|
409 | strcpy((char*)arb_species_name, arb_species_name_); |
---|
410 | arb_gene_name = arb_species_name+(arb_species_name_len+1); |
---|
411 | strcpy((char*)arb_gene_name, arb_gene_name_); |
---|
412 | } |
---|
413 | |
---|
414 | public: |
---|
415 | gene_struct(const char *gene_name_, const char *arb_species_name_, const char *arb_gene_name_) { |
---|
416 | init(gene_name_, arb_species_name_, arb_gene_name_); |
---|
417 | } |
---|
418 | gene_struct(const gene_struct& other) { |
---|
419 | if (&other != this) { |
---|
420 | init(other.get_internal_gene_name(), other.get_arb_species_name(), other.get_arb_gene_name()); |
---|
421 | } |
---|
422 | } |
---|
423 | gene_struct& operator = (const gene_struct& other) { |
---|
424 | if (&other != this) { |
---|
425 | delete [] gene_name; |
---|
426 | init(other.get_internal_gene_name(), other.get_arb_species_name(), other.get_arb_gene_name()); |
---|
427 | } |
---|
428 | return *this; |
---|
429 | } |
---|
430 | |
---|
431 | ~gene_struct() { |
---|
432 | delete [] gene_name; |
---|
433 | } |
---|
434 | |
---|
435 | const char *get_internal_gene_name() const { return gene_name; } |
---|
436 | const char *get_arb_species_name() const { return arb_species_name; } |
---|
437 | const char *get_arb_gene_name() const { return arb_gene_name; } |
---|
438 | }; |
---|
439 | |
---|
440 | struct ltByArbName { |
---|
441 | bool operator()(const gene_struct *gs1, const gene_struct *gs2) const { |
---|
442 | int cmp = strcmp(gs1->get_arb_species_name(), gs2->get_arb_species_name()); |
---|
443 | if (cmp == 0) { cmp = strcmp(gs1->get_arb_gene_name(), gs2->get_arb_gene_name()); } |
---|
444 | return cmp<0; |
---|
445 | } |
---|
446 | }; |
---|
447 | struct ltByInternalName { |
---|
448 | bool operator()(const gene_struct *gs1, const gene_struct *gs2) const { |
---|
449 | int cmp = strcmp(gs1->get_internal_gene_name(), gs2->get_internal_gene_name()); |
---|
450 | return cmp<0; |
---|
451 | } |
---|
452 | }; |
---|
453 | |
---|
454 | typedef std::list<gene_struct> gene_struct_list; |
---|
455 | typedef std::set<const gene_struct *, ltByInternalName> gene_struct_index_internal; |
---|
456 | typedef std::set<const gene_struct *, ltByArbName> gene_struct_index_arb; |
---|
457 | |
---|
458 | extern gene_struct_index_arb gene_struct_arb2internal; // sorted by arb species+gene name |
---|
459 | extern gene_struct_index_internal gene_struct_internal2arb; // sorted by internal name |
---|
460 | |
---|
461 | #else |
---|
462 | #error probe.h included twice |
---|
463 | #endif |
---|
464 | |
---|