1 | #include <stdio.h> |
---|
2 | #include <stdlib.h> |
---|
3 | #include <string.h> |
---|
4 | // #include <malloc.h> |
---|
5 | #include <PT_server.h> |
---|
6 | #include <struct_man.h> |
---|
7 | #include <PT_server_prototypes.h> |
---|
8 | #include "probe.h" |
---|
9 | #include "probe_tree.hxx" |
---|
10 | #include "pt_prototypes.h" |
---|
11 | #include <arbdbt.h> |
---|
12 | |
---|
13 | |
---|
14 | // overloaded functions to avoid problems with type-punning: |
---|
15 | inline void aisc_link(dll_public *dll, PT_family_list *family) { aisc_link(reinterpret_cast<dllpublic_ext*>(dll), reinterpret_cast<dllheader_ext*>(family)); } |
---|
16 | |
---|
17 | /*****************************************************************************/ |
---|
18 | /*****************************************************************************/ |
---|
19 | /*****************************************************************************/ |
---|
20 | /* Increment match_count for a matched chain */ |
---|
21 | |
---|
22 | int mark_all_matches_chain_handle(int name, int /*pos*/, int /*rpos*/, long /*clientdata*/) |
---|
23 | { |
---|
24 | psg.data[name].stat.match_count++; |
---|
25 | return 0; |
---|
26 | } |
---|
27 | /* Increment match_count for every match */ |
---|
28 | int mark_all_matches( PT_local *locs, |
---|
29 | POS_TREE *pt, |
---|
30 | char *probe, |
---|
31 | int length, |
---|
32 | int mismatches, |
---|
33 | int height, |
---|
34 | int max_mismatches) |
---|
35 | { |
---|
36 | int ref_pos, ref2_pos, ref_name; |
---|
37 | int base; |
---|
38 | int type_of_node; |
---|
39 | POS_TREE *pt_son; |
---|
40 | if (pt == NULL || mismatches > max_mismatches) { |
---|
41 | return 0; |
---|
42 | } |
---|
43 | type_of_node = PT_read_type(pt); |
---|
44 | if ((type_of_node = PT_read_type(pt)) != PT_NT_NODE) { |
---|
45 | /* |
---|
46 | * Found unique solution |
---|
47 | */ |
---|
48 | if (type_of_node == PT_NT_LEAF) { |
---|
49 | ref2_pos = PT_read_rpos(psg.ptmain,pt); |
---|
50 | ref_pos = ref2_pos + height; |
---|
51 | ref_name = PT_read_name(psg.ptmain,pt); |
---|
52 | /* |
---|
53 | * Check rest of probe |
---|
54 | */ |
---|
55 | while (mismatches <= max_mismatches && *probe) { |
---|
56 | if (psg.data[ref_name].data[ref_pos++] != *probe) { |
---|
57 | mismatches++; |
---|
58 | } |
---|
59 | probe++; |
---|
60 | height++; |
---|
61 | } |
---|
62 | /* |
---|
63 | * Increment count if probe matches |
---|
64 | */ |
---|
65 | if (mismatches <= max_mismatches) |
---|
66 | psg.data[ref_name].stat.match_count++; |
---|
67 | return 0; |
---|
68 | } else { /* type_of_node == CHAIN !! */ |
---|
69 | psg.mismatches = mismatches; |
---|
70 | psg.height = height; |
---|
71 | psg.length = length; |
---|
72 | psg.probe = probe; |
---|
73 | PT_read_chain(psg.ptmain,pt,mark_all_matches_chain_handle, (long) locs); |
---|
74 | return 0; |
---|
75 | } |
---|
76 | } |
---|
77 | for (base = PT_N; base < PT_B_MAX; base++) { |
---|
78 | if ( (pt_son = PT_read_son(psg.ptmain, pt, (PT_BASES)base))) { |
---|
79 | if (*probe) { |
---|
80 | if (*probe != base) { |
---|
81 | mark_all_matches(locs, pt_son, probe+1, length, |
---|
82 | mismatches + 1, height + 1, max_mismatches); |
---|
83 | } else { |
---|
84 | mark_all_matches(locs, pt_son, probe+1, length, |
---|
85 | mismatches, height + 1, max_mismatches); |
---|
86 | } |
---|
87 | } else { |
---|
88 | mark_all_matches(locs, pt_son, probe, length, |
---|
89 | mismatches, height + 1, max_mismatches); |
---|
90 | } |
---|
91 | } |
---|
92 | } |
---|
93 | return 0; |
---|
94 | } |
---|
95 | |
---|
96 | /* Clear all informations in psg.data[i].stat */ |
---|
97 | void clear_statistic(){ |
---|
98 | int i; |
---|
99 | for (i = 0; i < psg.data_count; i++) |
---|
100 | memset((char *) &psg.data[i].stat,0,sizeof(struct probe_statistic)); |
---|
101 | } |
---|
102 | |
---|
103 | |
---|
104 | /* Calculate the statistic informations for the family */ |
---|
105 | void make_match_statistic(int probe_len){ |
---|
106 | int i; |
---|
107 | /* |
---|
108 | * compute statistic for all species in family |
---|
109 | */ |
---|
110 | for (i = 0; i < psg.data_count; i++) { |
---|
111 | int all_len = psg.data[i].size - probe_len + 1; |
---|
112 | if (all_len <= 0){ |
---|
113 | psg.data[i].stat.rel_match_count = 0; |
---|
114 | }else{ |
---|
115 | psg.data[i].stat.rel_match_count = psg.data[i].stat.match_count / (double) (all_len); |
---|
116 | } |
---|
117 | } |
---|
118 | } |
---|
119 | |
---|
120 | |
---|
121 | |
---|
122 | #ifdef __cplusplus |
---|
123 | extern "C" { |
---|
124 | #endif |
---|
125 | |
---|
126 | /* Compare function for GB_mergesort() */ |
---|
127 | static long compare_probe_input_data0(void *pid_ptr1, void *pid_ptr2, char*) { |
---|
128 | struct probe_input_data *d1 = (struct probe_input_data*)pid_ptr1; |
---|
129 | struct probe_input_data *d2 = (struct probe_input_data*)pid_ptr2; |
---|
130 | |
---|
131 | if (d1->stat.match_count < d2->stat.match_count) return 1; |
---|
132 | if (d1->stat.match_count == d2->stat.match_count) return 0; |
---|
133 | return -1; |
---|
134 | } |
---|
135 | |
---|
136 | static long compare_probe_input_data1(void *pid_ptr1, void *pid_ptr2, char*) { |
---|
137 | struct probe_input_data *d1 = (struct probe_input_data*)pid_ptr1; |
---|
138 | struct probe_input_data *d2 = (struct probe_input_data*)pid_ptr2; |
---|
139 | |
---|
140 | if (d1->stat.rel_match_count < d2->stat.rel_match_count) return 1; |
---|
141 | if (d1->stat.rel_match_count == d2->stat.rel_match_count) return 0; |
---|
142 | return -1; |
---|
143 | } |
---|
144 | |
---|
145 | #ifdef __cplusplus |
---|
146 | } |
---|
147 | #endif |
---|
148 | |
---|
149 | /* Make sortet list of family members */ |
---|
150 | int make_PT_family_list(PT_local *locs) |
---|
151 | { |
---|
152 | struct probe_input_data **my_list; |
---|
153 | PT_family_list *fl; |
---|
154 | int i; |
---|
155 | /* |
---|
156 | * Sort the data |
---|
157 | */ |
---|
158 | my_list = (struct probe_input_data**) calloc(sizeof(void *),psg.data_count); |
---|
159 | for (i = 0; i < psg.data_count; i++) my_list[i] = &psg.data[i]; |
---|
160 | if (locs->sort_type == 0){ |
---|
161 | GB_mergesort((void **) my_list,0,psg.data_count, compare_probe_input_data0,0); |
---|
162 | }else{ |
---|
163 | GB_mergesort((void **) my_list,0,psg.data_count, compare_probe_input_data1,0); |
---|
164 | } |
---|
165 | /* |
---|
166 | * destroy old list |
---|
167 | */ |
---|
168 | while(locs->fl) destroy_PT_family_list(locs->fl); |
---|
169 | /* |
---|
170 | * build new list |
---|
171 | */ |
---|
172 | for (i = 0; i < psg.data_count; i++) { |
---|
173 | if (my_list[i]->stat.match_count == 0) continue; |
---|
174 | fl = create_PT_family_list(); |
---|
175 | fl->name = strdup(my_list[i]->name); |
---|
176 | fl->matches = my_list[i]->stat.match_count; |
---|
177 | fl->rel_matches = my_list[i]->stat.rel_match_count; |
---|
178 | aisc_link(&locs->pfl, fl); |
---|
179 | } |
---|
180 | free((char *)my_list); |
---|
181 | return 0; |
---|
182 | } |
---|
183 | |
---|
184 | /* Check the probe for inconsitencies */ |
---|
185 | int probe_is_ok(char *probe, int probe_len, char first_c, char second_c) |
---|
186 | { |
---|
187 | int i; |
---|
188 | if (probe_len < 2 || probe[0] != first_c || probe[1] != second_c) |
---|
189 | return 0; |
---|
190 | for (i = 2; i < probe_len; i++) |
---|
191 | if (probe[i] < PT_A || probe[i] > PT_T) |
---|
192 | return 0; |
---|
193 | return 1; |
---|
194 | } |
---|
195 | /* make sorted list of family members of species */ |
---|
196 | extern "C" int find_family(PT_local *locs, bytestring *species) |
---|
197 | { |
---|
198 | char first_c, second_c; |
---|
199 | char *sequence, *probe; |
---|
200 | int sequence_len; |
---|
201 | int probe_len = locs->pr_len; |
---|
202 | int mismatch_nr = locs->mis_nr; |
---|
203 | clear_statistic(); |
---|
204 | sequence = species->data; |
---|
205 | sequence_len = probe_compress_sequence(sequence); |
---|
206 | /* |
---|
207 | * search for sorted probes |
---|
208 | */ |
---|
209 | /* |
---|
210 | * find all "*" |
---|
211 | */ |
---|
212 | if (locs->find_type == 0) { |
---|
213 | for (first_c = PT_A; first_c <= PT_T; first_c++) |
---|
214 | for (second_c = PT_A; second_c <= PT_T; second_c++) |
---|
215 | for (probe=sequence;probe<sequence+sequence_len-probe_len;probe++) |
---|
216 | if (probe_is_ok(probe,probe_len,first_c,second_c)) |
---|
217 | mark_all_matches(locs,psg.pt,probe,probe_len,0,0,mismatch_nr); |
---|
218 | } |
---|
219 | /* |
---|
220 | * find only "a*" |
---|
221 | */ |
---|
222 | else { |
---|
223 | first_c = PT_A; |
---|
224 | for (second_c = PT_A; second_c <= PT_T; second_c++) |
---|
225 | for (probe=sequence;probe<sequence+sequence_len-probe_len;probe++) |
---|
226 | if (probe_is_ok(probe,probe_len,first_c,second_c)) |
---|
227 | mark_all_matches(locs,psg.pt,probe,probe_len,0,0,mismatch_nr); |
---|
228 | } |
---|
229 | |
---|
230 | make_match_statistic(locs->pr_len); |
---|
231 | make_PT_family_list(locs); |
---|
232 | free(species->data); |
---|
233 | return 0; |
---|
234 | } |
---|