1 | |
---|
2 | #include <stdio.h> |
---|
3 | #include <stdlib.h> |
---|
4 | #include <string.h> |
---|
5 | #include <PT_server.h> |
---|
6 | #include <PT_server_prototypes.h> |
---|
7 | #include <struct_man.h> |
---|
8 | #include "ptpan.h" |
---|
9 | #include "pt_prototypes.h" |
---|
10 | #include <arbdbt.h> |
---|
11 | #include <math.h> |
---|
12 | |
---|
13 | /* /// "SearchPartition()" */ |
---|
14 | void SearchPartition(struct PTPanPartition *pp, struct SearchQuery *sq) |
---|
15 | { |
---|
16 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
---|
17 | struct SearchQuery *tmpsq; |
---|
18 | |
---|
19 | /* do search on this partition */ |
---|
20 | tmpsq = CloneSearchQuery(sq); |
---|
21 | tmpsq->sq_PTPanPartition = pp; |
---|
22 | tmpsq->sq_SourceSeq = pp->pp_PrefixSeq; |
---|
23 | |
---|
24 | if (PTPanGlobalPtr->pg_verbose >0) |
---|
25 | printf("== SearchPartition: for %s\n", sq->sq_Query); |
---|
26 | |
---|
27 | if(MatchSequence(tmpsq)) |
---|
28 | { |
---|
29 | if(!(pp->pp_CacheNode = CacheLoadData(pg->pg_PartitionCache, pp->pp_CacheNode, pp))) |
---|
30 | { |
---|
31 | return; /* something went wrong while loading */ |
---|
32 | } |
---|
33 | SearchTree(tmpsq); |
---|
34 | PostFilterQueryHits(tmpsq); |
---|
35 | MergeQueryHits(sq, tmpsq); /* needs semaphore protection on parallel runs */ |
---|
36 | } |
---|
37 | pp->pp_Done = TRUE; |
---|
38 | FreeSearchQuery(tmpsq); |
---|
39 | } |
---|
40 | /* \\\ */ |
---|
41 | |
---|
42 | #ifdef BENCHMARK |
---|
43 | /* /// "QueryTests()" */ |
---|
44 | void QueryTests(struct PTPanGlobal *pg) |
---|
45 | { |
---|
46 | PT_local *locs; |
---|
47 | STRPTR ecoli; |
---|
48 | ULONG ecolilen; |
---|
49 | ULONG pos; |
---|
50 | ULONG qlen; |
---|
51 | char buf[32]; |
---|
52 | |
---|
53 | locs = (PT_local *) calloc(1, sizeof(PT_local)); |
---|
54 | ecoli = FilterSequence(pg, pg->pg_EcoliSeq); |
---|
55 | ecolilen = strlen(ecoli); |
---|
56 | locs->pm_max = 0; /* exact search */ |
---|
57 | locs->pm_complement = 0; |
---|
58 | locs->pm_reversed = 0; |
---|
59 | locs->sort_by = SORT_HITS_WEIGHTED; |
---|
60 | |
---|
61 | //qlen = 18; |
---|
62 | for(locs->pm_max = 4; locs->pm_max < 5; locs->pm_max++) |
---|
63 | { |
---|
64 | // for(qlen = 20; qlen - locs->pm_max >= 10; qlen--) |
---|
65 | for(qlen = 9 + locs->pm_max; qlen >= 10; qlen--) |
---|
66 | // for(qlen = 31; qlen >= 16; qlen--) |
---|
67 | { |
---|
68 | pg->pg_Bench.ts_Hits = 0; |
---|
69 | pg->pg_Bench.ts_UnsafeHits = 0; |
---|
70 | pg->pg_Bench.ts_UnsafeKilled = 0; |
---|
71 | pg->pg_Bench.ts_DupsKilled = 0; |
---|
72 | pg->pg_Bench.ts_CrossBoundKilled = 0; |
---|
73 | pg->pg_Bench.ts_DotsKilled = 0; |
---|
74 | pg->pg_Bench.ts_OutHits = 0; |
---|
75 | pg->pg_Bench.ts_CandSetTime= 0; |
---|
76 | pg->pg_Bench.ts_OutputTime = 0; |
---|
77 | |
---|
78 | for(pos = 0; pos < ecolilen - qlen; pos += 2) |
---|
79 | { |
---|
80 | strncpy(buf, &ecoli[pos], qlen); |
---|
81 | buf[qlen] = 0; |
---|
82 | probe_match(locs, strdup(buf)); |
---|
83 | } |
---|
84 | printf("qDAT: (queries qlen err hits gentime unsafe unkill dupskill crosskill dotskill outhits outtime)\n"); |
---|
85 | printf("%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %s QDAT\n", |
---|
86 | pos, qlen, locs->pm_max, |
---|
87 | pg->pg_Bench.ts_Hits, |
---|
88 | pg->pg_Bench.ts_CandSetTime, |
---|
89 | pg->pg_Bench.ts_UnsafeHits, |
---|
90 | pg->pg_Bench.ts_UnsafeKilled, |
---|
91 | pg->pg_Bench.ts_DupsKilled, |
---|
92 | pg->pg_Bench.ts_CrossBoundKilled, |
---|
93 | pg->pg_Bench.ts_DotsKilled, |
---|
94 | pg->pg_Bench.ts_OutHits, |
---|
95 | pg->pg_Bench.ts_OutputTime, |
---|
96 | pg->pg_DBName); |
---|
97 | fflush(stdout); |
---|
98 | } |
---|
99 | } |
---|
100 | printf("Done\n"); |
---|
101 | free(ecoli); |
---|
102 | free(locs); |
---|
103 | } |
---|
104 | /* \\\ */ |
---|
105 | #endif |
---|
106 | |
---|
107 | |
---|
108 | void PP_convertBondMatrix(PT_pdc *pdc, PTPanGlobal *pg) |
---|
109 | { |
---|
110 | for (int query = SEQCODE_A; query <= SEQCODE_T; ++query) { |
---|
111 | for (int species = SEQCODE_A; species <= SEQCODE_T; ++species) { |
---|
112 | int rowIdx = (pg->pg_ComplementTable[query] - SEQCODE_A)*4; |
---|
113 | int maxIdx = rowIdx + query - SEQCODE_A; |
---|
114 | int newIdx = rowIdx + species - SEQCODE_A; |
---|
115 | |
---|
116 | double max_bind = pdc->bond[maxIdx].val; |
---|
117 | double new_bind = pdc->bond[newIdx].val; |
---|
118 | |
---|
119 | pg->pg_MismatchWeights.mw_Replace[query * ALPHASIZE + species] = max_bind - new_bind; |
---|
120 | } |
---|
121 | } |
---|
122 | #if defined(DEBUG) |
---|
123 | printf("Current bond values:\n"); |
---|
124 | for (int y = 0; y<4; y++) { |
---|
125 | for (int x = 0; x<4; x++) { |
---|
126 | printf("%5.2f", pdc->bond[y*4+x].val); |
---|
127 | } |
---|
128 | printf("\n"); |
---|
129 | } |
---|
130 | printf("Current Replace Matrix:\n"); |
---|
131 | for (int query = SEQCODE_A; query <= SEQCODE_T; ++query) { |
---|
132 | for (int species = SEQCODE_A; species <= SEQCODE_T; ++species) { |
---|
133 | printf("%5.2f", pg->pg_MismatchWeights.mw_Replace[query * ALPHASIZE + species]); |
---|
134 | } |
---|
135 | printf("\n"); |
---|
136 | } |
---|
137 | #endif // DEBUG |
---|
138 | } |
---|
139 | |
---|
140 | |
---|
/*
 * Position dependent mismatch weight.
 *
 * Evaluates the parabola
 *     4 * y2 * pos * (last - pos) / last^2 + y1      with last = seq_len - 1
 * which is y1 at both ends of the sequence and y1 + y2 in the middle.
 *
 * pos     - zero based position inside the probe
 * seq_len - probe length
 * y1      - weight at the first and last position
 * y2      - additional weight reached in the middle of the probe
 *
 * A probe with fewer than two positions has no "middle": y1 is returned
 * (the plain formula would divide 0 by 0 and return NaN for seq_len == 1).
 * The products are formed in double so they cannot overflow int.
 */
static double PP_calc_position_wmis(int pos, int seq_len, double y1, double y2)
{
    const int last = seq_len - 1;

    if (last <= 0) {
        return y1;
    }

    const double span  = (double) last;
    const double curve = ((double) pos * (double) (last - pos)) / (span * span);

    return curve * (y2 * 4.0) + y1;
}
---|
145 | |
---|
146 | |
---|
147 | void PP_buildPosWeight(SearchQuery *sq) |
---|
148 | { |
---|
149 | if (sq->sq_PosWeight) delete[] sq->sq_PosWeight; |
---|
150 | //printf("buildPosWeight: ...new double[%i];\n", sq->sq_QueryLen+1); |
---|
151 | sq->sq_PosWeight = new double[sq->sq_QueryLen+1]; // TODO: check if +1 is necessary |
---|
152 | |
---|
153 | for (int pos=0; pos < sq->sq_QueryLen; ++pos) { |
---|
154 | if (sq->sq_SortMode == SORT_HITS_WEIGHTED) { |
---|
155 | sq->sq_PosWeight[pos] = PP_calc_position_wmis(pos, sq->sq_QueryLen, 0.3, 1.0); |
---|
156 | }else{ |
---|
157 | sq->sq_PosWeight[pos] = 1.0; |
---|
158 | } |
---|
159 | } |
---|
160 | sq->sq_PosWeight[sq->sq_QueryLen] = 0.0; // TODO: check if last pos is necessary |
---|
161 | #if defined(DEBUG) |
---|
162 | printf("sq_Posweight[]: "); |
---|
163 | for (int pos=0; pos < sq->sq_QueryLen; ++pos) { |
---|
164 | printf("%f, ", sq->sq_PosWeight[pos]); |
---|
165 | } |
---|
166 | printf("%f\n", sq->sq_PosWeight[sq->sq_QueryLen]); |
---|
167 | #endif |
---|
168 | } |
---|
169 | |
---|
170 | |
---|
171 | /* /// "probe_match()" */ |
---|
172 | extern "C" int probe_match(PT_local *locs, aisc_string probestring) |
---|
173 | { |
---|
174 | struct PTPanGlobal *pg = PTPanGlobalPtr; |
---|
175 | struct PTPanPartition *pp; |
---|
176 | struct SearchQuery *sq; |
---|
177 | struct SearchQuery *compsq = NULL; |
---|
178 | PT_probematch *ml; |
---|
179 | |
---|
180 | pg->pg_SearchPrefs = locs; |
---|
181 | |
---|
182 | PP_convertBondMatrix(locs->pdc, pg); |
---|
183 | |
---|
184 | /* find out where a given probe matches */ |
---|
185 | if (PTPanGlobalPtr->pg_verbose >0) { |
---|
186 | printf("Search request for %s (errs = %d, compl = %d, rev = %d, weight = %d)\n", |
---|
187 | probestring, |
---|
188 | pg->pg_SearchPrefs->pm_max, |
---|
189 | pg->pg_SearchPrefs->pm_complement, |
---|
190 | pg->pg_SearchPrefs->pm_reversed, |
---|
191 | pg->pg_SearchPrefs->sort_by); |
---|
192 | } |
---|
193 | |
---|
194 | /* free the old sequence */ |
---|
195 | if(pg->pg_SearchPrefs->pm_sequence) |
---|
196 | { |
---|
197 | free(pg->pg_SearchPrefs->pm_sequence); |
---|
198 | } |
---|
199 | pg->pg_SearchPrefs->pm_sequence = FilterSequence(pg, probestring); |
---|
200 | |
---|
201 | /* do we need to check the complement instead of the normal one? */ |
---|
202 | if(pg->pg_SearchPrefs->pm_complement) |
---|
203 | { |
---|
204 | ComplementSequence(pg, pg->pg_SearchPrefs->pm_sequence); |
---|
205 | } |
---|
206 | |
---|
207 | /* do we need to look at the reversed sequence as well? */ |
---|
208 | if(pg->pg_SearchPrefs->pm_reversed) |
---|
209 | { |
---|
210 | if(pg->pg_SearchPrefs->pm_csequence) |
---|
211 | { |
---|
212 | free(pg->pg_SearchPrefs->pm_csequence); |
---|
213 | } |
---|
214 | pg->pg_SearchPrefs->pm_csequence = strdup(pg->pg_SearchPrefs->pm_sequence); |
---|
215 | ReverseSequence(pg, pg->pg_SearchPrefs->pm_csequence); |
---|
216 | ComplementSequence(pg, pg->pg_SearchPrefs->pm_csequence); |
---|
217 | } |
---|
218 | |
---|
219 | //psg.main_probe = strdup(probestring); |
---|
220 | |
---|
221 | /* clear all old matches */ |
---|
222 | while((ml = pg->pg_SearchPrefs->pm)) |
---|
223 | { |
---|
224 | destroy_PT_probematch(ml); |
---|
225 | } |
---|
226 | |
---|
227 | #if 1 |
---|
228 | /* check, if the probe string is too short */ |
---|
229 | if(strlen(pg->pg_SearchPrefs->pm_sequence) + |
---|
230 | (2 * pg->pg_SearchPrefs->pm_max) < MIN_PROBE_LENGTH) |
---|
231 | { |
---|
232 | SetARBErrorMsg(pg->pg_SearchPrefs, (STRPTR) "error: probe too short!!\n"); |
---|
233 | free(probestring); |
---|
234 | return(0); |
---|
235 | } |
---|
236 | #endif |
---|
237 | |
---|
238 | /* allocate query that configures and holds all the merged results */ |
---|
239 | sq = AllocSearchQuery(pg); |
---|
240 | |
---|
241 | /* prefs */ |
---|
242 | sq->sq_Query = (STRPTR) pg->pg_SearchPrefs->pm_sequence; |
---|
243 | sq->sq_QueryLen = strlen(sq->sq_Query); |
---|
244 | sq->sq_MaxErrors = (float) pg->pg_SearchPrefs->pm_max; |
---|
245 | sq->sq_Reversed = FALSE; |
---|
246 | sq->sq_AllowReplace = TRUE; |
---|
247 | sq->sq_AllowInsert = TRUE; |
---|
248 | sq->sq_AllowDelete = TRUE; |
---|
249 | sq->sq_KillNSeqsAt = strlen(sq->sq_Query) / 3; |
---|
250 | sq->sq_MinorMisThres = pg->pg_SearchPrefs->pdc->split; |
---|
251 | sq->sq_SortMode = pg->pg_SearchPrefs->sort_by; |
---|
252 | |
---|
253 | /* init */ |
---|
254 | sq->sq_PTPanPartition = NULL; |
---|
255 | PP_buildPosWeight(sq); |
---|
256 | if(pg->pg_SearchPrefs->sort_by) |
---|
257 | { |
---|
258 | /* user requested weighted searching */ |
---|
259 | sq->sq_MismatchWeights = &sq->sq_PTPanGlobal->pg_MismatchWeights; |
---|
260 | } else { |
---|
261 | /* user wants unified searching */ |
---|
262 | sq->sq_MismatchWeights = &sq->sq_PTPanGlobal->pg_NoWeights; |
---|
263 | } |
---|
264 | |
---|
265 | /* do we need to do a second query on the complement? */ |
---|
266 | if(pg->pg_SearchPrefs->pm_reversed) |
---|
267 | { |
---|
268 | compsq = CloneSearchQuery(sq); |
---|
269 | compsq->sq_Query = (STRPTR) pg->pg_SearchPrefs->pm_csequence; |
---|
270 | compsq->sq_Reversed = TRUE; |
---|
271 | } |
---|
272 | |
---|
273 | /* start time here */ |
---|
274 | #ifdef BENCHMARK |
---|
275 | if (PTPanGlobalPtr->pg_verbose >0) |
---|
276 | BenchTimePassed(pg); |
---|
277 | #endif |
---|
278 | |
---|
279 | /* search over partitions that are still in cache */ |
---|
280 | pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head; |
---|
281 | while(pp->pp_Node.ln_Succ) |
---|
282 | { |
---|
283 | pp->pp_Done = FALSE; |
---|
284 | if(CacheDataLoaded(pp->pp_CacheNode)) |
---|
285 | { |
---|
286 | /* search normal */ |
---|
287 | SearchPartition(pp, sq); |
---|
288 | /* and optionally, search complement */ |
---|
289 | if(compsq) |
---|
290 | { |
---|
291 | SearchPartition(pp, compsq); |
---|
292 | } |
---|
293 | } |
---|
294 | pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ; |
---|
295 | } |
---|
296 | |
---|
297 | /* search over all partitions not done yet */ |
---|
298 | pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head; |
---|
299 | while(pp->pp_Node.ln_Succ) |
---|
300 | { |
---|
301 | if(!pp->pp_Done) |
---|
302 | { |
---|
303 | /* search normal */ |
---|
304 | SearchPartition(pp, sq); |
---|
305 | /* and optionally, search complement */ |
---|
306 | if(compsq) |
---|
307 | { |
---|
308 | SearchPartition(pp, compsq); |
---|
309 | } |
---|
310 | } |
---|
311 | pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ; |
---|
312 | } |
---|
313 | |
---|
314 | #ifdef BENCHMARK |
---|
315 | if (PTPanGlobalPtr->pg_verbose >0) |
---|
316 | pg->pg_Bench.ts_CandSetTime += BenchTimePassed(pg); |
---|
317 | #endif |
---|
318 | |
---|
319 | SortHitsList(sq); |
---|
320 | CreateHitsGUIList(sq); |
---|
321 | FreeSearchQuery(sq); |
---|
322 | if(compsq) |
---|
323 | { |
---|
324 | SortHitsList(compsq); |
---|
325 | CreateHitsGUIList(compsq); |
---|
326 | FreeSearchQuery(compsq); |
---|
327 | } |
---|
328 | |
---|
329 | #ifdef BENCHMARK |
---|
330 | if (PTPanGlobalPtr->pg_verbose >0) |
---|
331 | pg->pg_Bench.ts_OutputTime += BenchTimePassed(pg); |
---|
332 | #endif |
---|
333 | |
---|
334 | free(probestring); /* I actually don't know, if this is required */ |
---|
335 | return 0; |
---|
336 | } |
---|
337 | /* \\\ */ |
---|
338 | |
---|
339 | /* /// "SortHitsList()" */ |
---|
340 | void SortHitsList(struct SearchQuery *sq) |
---|
341 | { |
---|
342 | //struct PTPanGlobal *pg = sq->sq_PTPanGlobal; |
---|
343 | struct QueryHit *qh; |
---|
344 | |
---|
345 | /* enter priority and sort */ |
---|
346 | qh = (struct QueryHit *) sq->sq_Hits.lh_Head; |
---|
347 | if(sq->sq_SortMode == SORT_HITS_NOWEIGHT) |
---|
348 | { |
---|
349 | /* sorting criteria: |
---|
350 | - normal/composite (1 bit) |
---|
351 | - replace only or insert/delete (1 bit) |
---|
352 | - mismatch count (5 bits) |
---|
353 | - error count (8 bits) |
---|
354 | - species (20 bits) |
---|
355 | - absolute position (28 bits) |
---|
356 | */ |
---|
357 | //printf("Sort no weight...\n"); |
---|
358 | while(qh->qh_Node.ln_Succ) |
---|
359 | { |
---|
360 | arb_assert(((LLONG) (qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount)) <= 0x1f); // 5 bit |
---|
361 | arb_assert(((LLONG) round(qh->qh_ErrorCount * 10.0)) <= 0xff); // 8 bit |
---|
362 | arb_assert(((LLONG) qh->qh_Species->ps_Num) <= 0xfffff); // 20 bit |
---|
363 | arb_assert(((LLONG) (qh->qh_AbsPos - qh->qh_Species->ps_AbsOffset)) <= 0xfffffff); // 28 bit |
---|
364 | qh->qh_Node.ln_Pri = (LLONG) |
---|
365 | ((qh->qh_Flags & QHF_REVERSED) ? (1LL << 62) : 0LL) + |
---|
366 | ((qh->qh_InsertCount | qh->qh_DeleteCount) ? (1LL << 61) : 0LL) + |
---|
367 | (((LLONG) (qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount)) << 56) + |
---|
368 | (((LLONG) round(qh->qh_ErrorCount * 10.0)) << 48) + |
---|
369 | (((LLONG) qh->qh_Species->ps_Num) << 28) + |
---|
370 | ((LLONG) (qh->qh_AbsPos - qh->qh_Species->ps_AbsOffset)); |
---|
371 | qh = (struct QueryHit *) qh->qh_Node.ln_Succ; |
---|
372 | } |
---|
373 | } else { |
---|
374 | //printf("Sort with weight...\n"); |
---|
375 | while(qh->qh_Node.ln_Succ) |
---|
376 | { |
---|
377 | arb_assert(((LLONG) (qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount)) <= 0x1f); // 5 bit |
---|
378 | arb_assert(((LLONG) round(qh->qh_ErrorCount * 10.0)) <= 0xff); // 8 bit |
---|
379 | arb_assert(((LLONG) qh->qh_Species->ps_Num) <= 0xfffff); // 20 bit |
---|
380 | arb_assert(((LLONG) (qh->qh_AbsPos - qh->qh_Species->ps_AbsOffset)) <= 0xfffffff); // 28 bit |
---|
381 | qh->qh_Node.ln_Pri = (LLONG) |
---|
382 | ((LLONG) (qh->qh_Flags & QHF_REVERSED) ? (1LL << 62) : 0LL) + |
---|
383 | ((LLONG) (qh->qh_InsertCount | qh->qh_DeleteCount) ? (1LL << 61) : 0LL) + |
---|
384 | (((LLONG) round(qh->qh_ErrorCount * 10.0)) << 53) + |
---|
385 | (((LLONG) (qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount)) << 48) + |
---|
386 | ((LLONG) qh->qh_Species->ps_Num << 28) + |
---|
387 | ((LLONG) (qh->qh_AbsPos - qh->qh_Species->ps_AbsOffset)); |
---|
388 | //printf("%16llx\n", qh->qh_Node.ln_Pri); |
---|
389 | qh = (struct QueryHit *) qh->qh_Node.ln_Succ; |
---|
390 | } |
---|
391 | } |
---|
392 | SortList(&sq->sq_Hits); |
---|
393 | #if 0 |
---|
394 | qh = (struct QueryHit *) sq->sq_Hits.lh_Head; |
---|
395 | //printf("... and after\n"); |
---|
396 | while(qh->qh_Node.ln_Succ) |
---|
397 | { |
---|
398 | //printf("%16llx\n", qh->qh_Node.ln_Pri); |
---|
399 | qh = (struct QueryHit *) qh->qh_Node.ln_Succ; |
---|
400 | } |
---|
401 | #endif |
---|
402 | } |
---|
403 | /* \\\ */ |
---|
404 | |
---|
405 | |
---|
406 | /* /// "CreateHitsGUIList()" */ |
---|
407 | void CreateHitsGUIList(struct SearchQuery *sq) |
---|
408 | { |
---|
409 | struct PTPanGlobal *pg = sq->sq_PTPanGlobal; |
---|
410 | struct PTPanSpecies *ps; |
---|
411 | struct QueryHit *qh; |
---|
412 | STRPTR srcptr; |
---|
413 | STRPTR tarptr; |
---|
414 | ULONG maxlen; |
---|
415 | float minweight; |
---|
416 | ULONG cnt; |
---|
417 | ULONG numhits; |
---|
418 | ULONG tarlen; |
---|
419 | |
---|
420 | if (PTPanGlobalPtr->pg_verbose >0) printf(">> CreateHitsGUIList\n"); |
---|
421 | |
---|
422 | /* calculate maximum size of string that we have to examine */ |
---|
423 | minweight = sq->sq_MismatchWeights->mw_Delete[0]; |
---|
424 | for(cnt = 1; cnt < pg->pg_AlphaSize; cnt++) |
---|
425 | { |
---|
426 | if(sq->sq_MismatchWeights->mw_Delete[cnt] < minweight) |
---|
427 | { |
---|
428 | minweight = sq->sq_MismatchWeights->mw_Delete[cnt]; |
---|
429 | } |
---|
430 | } |
---|
431 | maxlen = sq->sq_QueryLen + (ULONG) ((sq->sq_MaxErrors + minweight) / minweight); |
---|
432 | sq->sq_SourceSeq = (STRPTR) malloc(maxlen + 1); |
---|
433 | |
---|
434 | numhits = 0; |
---|
435 | pg->pg_SpeciesCache->ch_SwapCount = 0; |
---|
436 | |
---|
437 | qh = (struct QueryHit *) sq->sq_Hits.lh_Head; |
---|
438 | while(qh->qh_Node.ln_Succ) |
---|
439 | { |
---|
440 | LONG relpos; |
---|
441 | BOOL good; |
---|
442 | ULONG nmismatch; |
---|
443 | UBYTE code; |
---|
444 | UBYTE seqcode; |
---|
445 | |
---|
446 | STRPTR seqout; |
---|
447 | STRPTR seqptr; |
---|
448 | LONG relposcnt; |
---|
449 | PT_probematch *ml; |
---|
450 | |
---|
451 | good = TRUE; |
---|
452 | ps = qh->qh_Species; |
---|
453 | char prefix[10], postfix[10]; |
---|
454 | for (int i = 0; i < 9; ++i) |
---|
455 | { |
---|
456 | prefix[i] = '>'; |
---|
457 | postfix[i] = '<'; |
---|
458 | } |
---|
459 | prefix[9] = postfix[9] = 0x00; |
---|
460 | |
---|
461 | relpos = 0; |
---|
462 | nmismatch = 0; |
---|
463 | ULONG abspos = qh->qh_AbsPos - ps->ps_AbsOffset; |
---|
464 | ULONG bitpos = 0; |
---|
465 | ULONG count; |
---|
466 | /* given an absolute sequence position, search for the relative one, |
---|
467 | e.g. abspos 2 on "-----UU-C-C" will yield 8 |
---|
468 | abspos: 01 2 3 |
---|
469 | relpos: 0123456789a */ |
---|
470 | /* |
---|
471 | if (strcmp(ps->ps_Name, "BclSp114") == 0) |
---|
472 | { |
---|
473 | printf("qh_AbsPos: %li\t\tps_AbsOffset: %li\t\tabspos:%li\n", |
---|
474 | qh->qh_AbsPos, ps->ps_AbsOffset, abspos); |
---|
475 | while ((code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count)) != 0xff) |
---|
476 | { |
---|
477 | if (count > 1) printf("%li%c", count, code); |
---|
478 | else printf("%c", code); |
---|
479 | } |
---|
480 | printf("\n"); |
---|
481 | bitpos = 0; |
---|
482 | } |
---|
483 | */ |
---|
484 | while (bitpos < ps->ps_SeqDataCompressedSize) // get relpos and store prefix |
---|
485 | { |
---|
486 | code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count); |
---|
487 | |
---|
488 | if (pg->pg_SeqCodeValidTable[code]) |
---|
489 | { // it's a valid char |
---|
490 | if (!(abspos--)) break; // position found |
---|
491 | if (abspos <= 8) prefix[8-abspos] = code; // store prefix |
---|
492 | ++relpos; |
---|
493 | } else |
---|
494 | { // it's not a valid char |
---|
495 | arb_assert((code == '.') || (code == '-')); |
---|
496 | |
---|
497 | #ifdef ALLOWDOTSINMATCH |
---|
498 | if ((code == '.') && (count == 1)) // check for dots in match |
---|
499 | { |
---|
500 | if (!(abspos--)) break; // position found |
---|
501 | if (abspos <= 8) prefix[8-abspos] = code; // store prefix |
---|
502 | } |
---|
503 | #endif |
---|
504 | relpos += count; |
---|
505 | if ((code == '.') && (abspos <= 9)) // fill prefix with '.' |
---|
506 | { // TODO: decide if we really want to fill the |
---|
507 | for (int i = 0; i < (9 - abspos); ++i) // whole prefix or just 'count' dots |
---|
508 | { |
---|
509 | prefix[i] = '.'; |
---|
510 | } |
---|
511 | } |
---|
512 | } |
---|
513 | } |
---|
514 | arb_assert(bitpos < ps->ps_SeqDataCompressedSize); |
---|
515 | bitpos -= 3; // bitpos now points to the first character of found seq |
---|
516 | //printf("%c, %li, %li, %li\n", code, ((unsigned long)code), count, bitpos); |
---|
517 | tarlen = sq->sq_QueryLen - qh->qh_DeleteCount + qh->qh_InsertCount; |
---|
518 | for (cnt = 0; cnt < tarlen;) // copy found string into sq->sq_SourceSeq[] |
---|
519 | { |
---|
520 | if (bitpos >= ps->ps_SeqDataCompressedSize) |
---|
521 | { |
---|
522 | arb_assert(false); |
---|
523 | good = FALSE; |
---|
524 | break; |
---|
525 | } |
---|
526 | |
---|
527 | code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count); |
---|
528 | if (pg->pg_SeqCodeValidTable[code]) // valid character |
---|
529 | { |
---|
530 | sq->sq_SourceSeq[cnt++] = code; |
---|
531 | if (code == 'N') nmismatch++; |
---|
532 | } else |
---|
533 | { |
---|
534 | if (code == '.') // if we got a dot in sequence |
---|
535 | { // the hit is bogus |
---|
536 | #ifdef ALLOWDOTSINMATCH |
---|
537 | if (count == 1) |
---|
538 | { |
---|
539 | sq->sq_SourceSeq[cnt++] = '.'; |
---|
540 | } else |
---|
541 | #endif |
---|
542 | { |
---|
543 | pg->pg_Bench.ts_DotsKilled++; |
---|
544 | good = FALSE; |
---|
545 | break; |
---|
546 | } |
---|
547 | } |
---|
548 | } |
---|
549 | } |
---|
550 | sq->sq_SourceSeq[tarlen] = 0; |
---|
551 | if(nmismatch == tarlen) good = FALSE; |
---|
552 | |
---|
553 | if(good) |
---|
554 | { |
---|
555 | /* we need to verify the hit? */ |
---|
556 | if(qh->qh_Flags & QHF_UNSAFE) |
---|
557 | { |
---|
558 | pg->pg_Bench.ts_UnsafeHits++; |
---|
559 | |
---|
560 | good = MatchSequence(sq); |
---|
561 | if(!good) |
---|
562 | { |
---|
563 | pg->pg_Bench.ts_UnsafeKilled++; |
---|
564 | //printf("Verify failed on %s != %s\n", sq->sq_SourceSeq, sq->sq_Query); |
---|
565 | qh->qh_Flags &= ~QHF_ISVALID; |
---|
566 | } else { |
---|
567 | /* fill in correct match */ |
---|
568 | qh->qh_ErrorCount = sq->sq_State.sqs_ErrorCount; |
---|
569 | qh->qh_ReplaceCount = sq->sq_State.sqs_ReplaceCount; |
---|
570 | qh->qh_InsertCount = sq->sq_State.sqs_InsertCount; |
---|
571 | qh->qh_DeleteCount = sq->sq_State.sqs_DeleteCount; |
---|
572 | } |
---|
573 | //qh->qh_Flags &= ~QHF_UNSAFE; |
---|
574 | } |
---|
575 | } //else printf("'.'-Sequence!\n"); |
---|
576 | |
---|
577 | if(good) |
---|
578 | { |
---|
579 | seqout = (STRPTR) calloc(9 + 1 + sq->sq_QueryLen + 1 + 9 + 1, 0x01); |
---|
580 | strncpy(seqout, prefix, 0x09); // copy prefix |
---|
581 | seqout[9] = '-'; // 1st delimiter |
---|
582 | good = FindSequenceMatch(sq, qh, &seqout[10]); // generate mismatch sequence */ |
---|
583 | seqout[10 + sq->sq_QueryLen] = '-'; // 2nd delimiter |
---|
584 | if (!good) free(seqout); |
---|
585 | } |
---|
586 | |
---|
587 | if (good) |
---|
588 | { |
---|
589 | for (cnt = 0; cnt < 9;) // generate postfix |
---|
590 | { |
---|
591 | code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count); |
---|
592 | if (code == 0xff) break; |
---|
593 | if (pg->pg_SeqCodeValidTable[code]) // valid character |
---|
594 | { |
---|
595 | postfix[cnt++] = code; |
---|
596 | } else if (code == '.') // '.' found |
---|
597 | { |
---|
598 | for (; cnt < 9; ++cnt) // fill postfix with '.' |
---|
599 | { // TODO: decide if we really want to fill the |
---|
600 | postfix[cnt] = '.'; // whole postfix or just 'count' dots |
---|
601 | } |
---|
602 | } |
---|
603 | } |
---|
604 | |
---|
605 | strncpy(&seqout[11 + sq->sq_QueryLen], postfix, 0x09); // copy postfix |
---|
606 | |
---|
607 | ml = create_PT_probematch(); |
---|
608 | ml->name = qh->qh_Species->ps_Num; |
---|
609 | ml->b_pos = relpos; |
---|
610 | ml->rpos = qh->qh_AbsPos - ps->ps_AbsOffset; |
---|
611 | ml->wmismatches = (double) qh->qh_ErrorCount; |
---|
612 | ml->mismatches = qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount; |
---|
613 | ml->N_mismatches = nmismatch; |
---|
614 | ml->sequence = seqout; /* warning! potentional memory leak -- FIX destroy_PT_probematch(ml) */ |
---|
615 | ml->reversed = (qh->qh_Flags & QHF_REVERSED) ? 1 : 0; |
---|
616 | |
---|
617 | aisc_link((struct_dllpublic_ext *) &(pg->pg_SearchPrefs->ppm), (struct_dllheader_ext *) ml); |
---|
618 | numhits++; |
---|
619 | |
---|
620 | if (PTPanGlobalPtr->pg_verbose >0) printf("SeqOut: '%s'\n", seqout); |
---|
621 | } |
---|
622 | |
---|
623 | RemQueryHit(qh); |
---|
624 | qh = (struct QueryHit *) sq->sq_Hits.lh_Head; |
---|
625 | |
---|
626 | } // while(qh->qh_Node.ln_Succ) |
---|
627 | free(sq->sq_SourceSeq); |
---|
628 | |
---|
629 | if (PTPanGlobalPtr->pg_verbose >0) { |
---|
630 | pg->pg_Bench.ts_OutHits += numhits; |
---|
631 | printf("<< CreateHitsGUIList: Number of hits %ld (SwapCount %ld)\n", |
---|
632 | numhits, pg->pg_SpeciesCache->ch_SwapCount); |
---|
633 | } |
---|
634 | } |
---|
635 | /* \\\ */ |
---|
636 | |
---|
637 | /* /// "get_match_info()" */ |
---|
638 | extern "C" STRPTR get_match_info(PT_probematch *ml) |
---|
639 | { |
---|
640 | struct PTPanGlobal *pg = PTPanGlobalPtr; |
---|
641 | struct PTPanSpecies *ps; |
---|
642 | ULONG ecolipos = 0; |
---|
643 | |
---|
644 | /* calculate ecoli position in O(1) */ |
---|
645 | if(pg->pg_EcoliBaseTable) |
---|
646 | { |
---|
647 | if((ULONG) ml->b_pos < pg->pg_EcoliSeqSize) |
---|
648 | { |
---|
649 | ecolipos = pg->pg_EcoliBaseTable[ml->b_pos]; |
---|
650 | } else { |
---|
651 | ecolipos = pg->pg_EcoliBaseTable[pg->pg_EcoliSeqSize]; |
---|
652 | } |
---|
653 | } |
---|
654 | ps = pg->pg_SpeciesMap[ml->name]; |
---|
655 | sprintf(pg->pg_TempBuffer, "%10s %-30.30s %2d %2d %1.1f %7d %4ld %1d %s", |
---|
656 | ps->ps_Name, ps->ps_FullName, |
---|
657 | ml->mismatches, ml->N_mismatches, ml->wmismatches, |
---|
658 | ml->b_pos, ecolipos, |
---|
659 | ml->reversed, ml->sequence); |
---|
660 | |
---|
661 | if (PTPanGlobalPtr->pg_verbose >0) printf("== get_match_info: %s\n", pg->pg_TempBuffer); |
---|
662 | |
---|
663 | return(pg->pg_TempBuffer); |
---|
664 | } |
---|
665 | /* \\\ */ |
---|
666 | |
---|
667 | /* /// "GetMatchListHeader()" */ |
---|
668 | STRPTR GetMatchListHeader(STRPTR seq) |
---|
669 | { |
---|
670 | STRPTR res; |
---|
671 | |
---|
672 | if(seq) |
---|
673 | { |
---|
674 | res = (STRPTR) GBS_global_string(" name fullname " |
---|
675 | "mis N_mis wmis pos ecoli rev '%s'", seq); |
---|
676 | } else { |
---|
677 | res = (STRPTR) " name fullname " |
---|
678 | "mis N_mis wmis pos ecoli rev"; |
---|
679 | } |
---|
680 | |
---|
681 | if (PTPanGlobalPtr->pg_verbose >0) printf("== GetMatchListHeader: %s\n", res); |
---|
682 | |
---|
683 | return(res); |
---|
684 | } |
---|
685 | /* \\\ */ |
---|
686 | |
---|
687 | /* /// "get_match_hinfo()" */ |
---|
688 | extern "C" STRPTR get_match_hinfo(PT_probematch *) |
---|
689 | { |
---|
690 | return(GetMatchListHeader(NULL)); |
---|
691 | } |
---|
692 | /* \\\ */ |
---|
693 | |
---|
694 | /* /// "c_get_match_hinfo()" */ |
---|
695 | extern "C" STRPTR c_get_match_hinfo(PT_probematch *) |
---|
696 | { |
---|
697 | printf("EXTERN: c_get_match_hinfo\n"); |
---|
698 | return(GetMatchListHeader(NULL)); |
---|
699 | } |
---|
700 | /* \\\ */ |
---|
701 | |
---|
702 | /* /// "match_string()" */ |
---|
703 | /* Create a big output string: header\001name\001info\001name\001info....\000 */ |
---|
704 | extern "C" bytestring * match_string(PT_local *locs) |
---|
705 | { |
---|
706 | struct PTPanGlobal *pg = PTPanGlobalPtr; |
---|
707 | struct GBS_strstruct *outstr; |
---|
708 | PT_probematch *ml; |
---|
709 | STRPTR srcptr; |
---|
710 | LONG entryCount = 0; |
---|
711 | |
---|
712 | printf("EXTERN: match_string\n"); |
---|
713 | free(pg->pg_ResultString.data); // free old memory |
---|
714 | for(ml = locs->pm; ml; ml = ml->next) // count number of entries |
---|
715 | ++entryCount; |
---|
716 | |
---|
717 | outstr = GBS_stropen(entryCount * 150); // 150 bytes per entry seemes to be a good estimation |
---|
718 | |
---|
719 | if(locs->pm) // add header |
---|
720 | { |
---|
721 | srcptr = GetMatchListHeader(locs->pm->reversed ? locs->pm_csequence : locs->pm_sequence); |
---|
722 | GBS_strcat(outstr, srcptr); |
---|
723 | GBS_chrcat(outstr, 1); |
---|
724 | } |
---|
725 | |
---|
726 | for(ml = locs->pm; ml; ml = ml->next) // add each entry to the list |
---|
727 | { |
---|
728 | srcptr = virt_name(ml); // add the name |
---|
729 | GBS_strcat(outstr, srcptr); |
---|
730 | GBS_chrcat(outstr, 1); |
---|
731 | |
---|
732 | srcptr = get_match_info(ml); // and the info |
---|
733 | GBS_strcat(outstr, srcptr); |
---|
734 | GBS_chrcat(outstr, 1); |
---|
735 | } |
---|
736 | |
---|
737 | pg->pg_ResultString.data = GBS_strclose(outstr); |
---|
738 | pg->pg_ResultString.size = strlen(pg->pg_ResultString.data) + 1; |
---|
739 | |
---|
740 | if (PTPanGlobalPtr->pg_verbose >0) printf("== match_string: %s\n", pg->pg_ResultString.data); |
---|
741 | |
---|
742 | #ifdef DEBUG |
---|
743 | printf("%li entries used %li bytes (%li MB) of buffer: %5.2f byte per entry\n", |
---|
744 | entryCount, pg->pg_ResultString.size, pg->pg_ResultString.size >> 20, |
---|
745 | (double)pg->pg_ResultString.size/(double)entryCount); |
---|
746 | #endif |
---|
747 | return(&pg->pg_ResultString); |
---|
748 | } |
---|
749 | /* \\\ */ |
---|
750 | |
---|
751 | /* /// "MP_match_string()" */ |
---|
752 | /* Create a big output string: header\001name\001#mismatch\001name\001#mismatch....\000 */ |
---|
753 | extern "C" bytestring * MP_match_string(PT_local *locs) |
---|
754 | { |
---|
755 | struct PTPanGlobal *pg = PTPanGlobalPtr; |
---|
756 | PT_probematch *ml; |
---|
757 | STRPTR outptr; |
---|
758 | STRPTR srcptr; |
---|
759 | LONG buflen = 100000000; // TODO: calculate buflen instead of using hard coded value |
---|
760 | |
---|
761 | printf("EXTERN: MP_match_string\n"); |
---|
762 | /* free old memory */ |
---|
763 | free(pg->pg_ResultMString.data); |
---|
764 | |
---|
765 | outptr = (STRPTR) malloc(buflen); |
---|
766 | pg->pg_ResultMString.data = outptr; |
---|
767 | |
---|
768 | buflen--; /* space for termination byte */ |
---|
769 | |
---|
770 | LONG entryCount = 0; |
---|
771 | /* add each entry to the list */ |
---|
772 | for(ml = locs->pm; ml; ml = ml->next) |
---|
773 | { |
---|
774 | ++entryCount; |
---|
775 | /* add the name */ |
---|
776 | srcptr = virt_name(ml); |
---|
777 | while((--buflen > 0) && (*outptr++ = *srcptr++)); |
---|
778 | if(buflen <= 0) |
---|
779 | { |
---|
780 | printf("ERROR: buffer too small - see function MP_match_string(...) in file PT_match.cxx\n"); |
---|
781 | break; |
---|
782 | } |
---|
783 | outptr[-1] = 1; |
---|
784 | |
---|
785 | /* and and the mismatch and wmismatch count */ |
---|
786 | sprintf(pg->pg_TempBuffer, "%2d\001%1.1f", ml->mismatches, ml->wmismatches); |
---|
787 | srcptr = pg->pg_TempBuffer; |
---|
788 | while((--buflen > 0) && (*outptr++ = *srcptr++)); |
---|
789 | if(buflen <= 0) |
---|
790 | { |
---|
791 | printf("ERROR: buffer too small - see function MP_match_string(...) in file PT_match.cxx\n"); |
---|
792 | break; |
---|
793 | } |
---|
794 | outptr[-1] = 1; |
---|
795 | } |
---|
796 | /* terminate string */ |
---|
797 | *outptr++ = 0; |
---|
798 | |
---|
799 | pg->pg_ResultMString.size = (ULONG) outptr - (ULONG) pg->pg_ResultMString.data; |
---|
800 | /* free unused memory */ |
---|
801 | pg->pg_ResultMString.data = (STRPTR) realloc(pg->pg_ResultMString.data, |
---|
802 | pg->pg_ResultMString.size); |
---|
803 | |
---|
804 | if (PTPanGlobalPtr->pg_verbose >0) printf("== MP_match_string: %s\n", pg->pg_ResultString.data); |
---|
805 | |
---|
806 | printf("%li entries used %li bytes (%li MB) of buffer: %5.2f byte per entry\n", |
---|
807 | entryCount, (100000000-buflen), (100000000-buflen) >> 20, (double)(100000000-buflen)/(double)entryCount); |
---|
808 | return(&pg->pg_ResultMString); |
---|
809 | } |
---|
810 | /* \\\ */ |
---|
811 | |
---|
812 | /* /// "MP_all_species_string()" */ |
---|
813 | /* Create a big output string: 001name\001name\....\000 */ |
---|
814 | extern "C" bytestring * MP_all_species_string(PT_local *) |
---|
815 | { |
---|
816 | struct PTPanGlobal *pg = PTPanGlobalPtr; |
---|
817 | struct PTPanSpecies *ps; |
---|
818 | STRPTR outptr; |
---|
819 | STRPTR srcptr; |
---|
820 | // LONG buflen = 500000; /* enough for about 50000 species */ |
---|
821 | LONG buflen = 100000000; // TODO: calculate buflen instead of using hard coded value |
---|
822 | |
---|
823 | printf("EXTERN: MP_all_species_string\n"); |
---|
824 | /* free old memory */ |
---|
825 | free(pg->pg_SpeciesString.data); |
---|
826 | |
---|
827 | outptr = (STRPTR) malloc(buflen); |
---|
828 | pg->pg_SpeciesString.data = outptr; |
---|
829 | |
---|
830 | buflen--; /* space for termination byte */ |
---|
831 | |
---|
832 | LONG entryCount = 0; |
---|
833 | /* add each entry to the list */ |
---|
834 | ps = (struct PTPanSpecies *) pg->pg_Species.lh_Head; |
---|
835 | while(ps->ps_Node.ln_Succ) |
---|
836 | { |
---|
837 | ++entryCount; |
---|
838 | /* add the name */ |
---|
839 | srcptr = ps->ps_Name; |
---|
840 | while((--buflen > 0) && (*outptr++ = *srcptr++)); |
---|
841 | if(buflen <= 0) |
---|
842 | { |
---|
843 | printf("ERROR: buffer too small - see function MP_all_species_string(...) in file PT_match.cxx\n"); |
---|
844 | break; |
---|
845 | } |
---|
846 | outptr[-1] = 1; |
---|
847 | ps = (struct PTPanSpecies *) ps->ps_Node.ln_Succ; |
---|
848 | } |
---|
849 | /* terminate string */ |
---|
850 | *outptr++ = 0; |
---|
851 | |
---|
852 | pg->pg_SpeciesString.size = (ULONG) outptr - (ULONG) pg->pg_SpeciesString.data; |
---|
853 | /* free unused memory */ |
---|
854 | pg->pg_SpeciesString.data = (STRPTR) realloc(pg->pg_SpeciesString.data, |
---|
855 | pg->pg_SpeciesString.size); |
---|
856 | printf("%li entries used %li bytes (%li MB) of buffer: %5.2f byte per entry\n", |
---|
857 | entryCount, (100000000-buflen), (100000000-buflen) >> 20, (double)(100000000-buflen)/(double)entryCount); |
---|
858 | return(&pg->pg_SpeciesString); |
---|
859 | } |
---|
860 | /* \\\ */ |
---|
861 | |
---|
862 | /* /// "MP_count_all_species()" */ |
---|
863 | extern "C" int MP_count_all_species(PT_local *) |
---|
864 | { |
---|
865 | struct PTPanGlobal *pg = PTPanGlobalPtr; |
---|
866 | printf("EXTERN: MP_count_all_species\n"); |
---|
867 | return(pg->pg_NumSpecies); |
---|
868 | } |
---|
869 | /* \\\ */ |
---|
870 | |
---|
871 | |
---|