source: branches/port5/ptpan/PTP_match.cxx

Last change on this file was 5908, checked in by westram, 16 years ago
  • source files with identical names are really a pain when using valgrind
File size: 27.3 KB
Line 
1
2#include <stdio.h>
3#include <stdlib.h>
4#include <string.h>
5#include <PT_server.h>
6#include <PT_server_prototypes.h>
7#include <struct_man.h>
8#include "ptpan.h"
9#include "pt_prototypes.h"
10#include <arbdbt.h>
11#include <math.h>
12
13/* /// "SearchPartition()" */
14void SearchPartition(struct PTPanPartition *pp, struct SearchQuery *sq)
15{
16  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
17  struct SearchQuery *tmpsq;
18
19  /* do search on this partition */
20  tmpsq = CloneSearchQuery(sq);
21  tmpsq->sq_PTPanPartition = pp;
22  tmpsq->sq_SourceSeq = pp->pp_PrefixSeq;
23
24  if (PTPanGlobalPtr->pg_verbose >0)
25    printf("== SearchPartition: for %s\n", sq->sq_Query);
26
27  if(MatchSequence(tmpsq))
28  {
29    if(!(pp->pp_CacheNode = CacheLoadData(pg->pg_PartitionCache, pp->pp_CacheNode, pp)))
30    {
31      return; /* something went wrong while loading */
32    }
33    SearchTree(tmpsq);
34    PostFilterQueryHits(tmpsq);
35    MergeQueryHits(sq, tmpsq); /* needs semaphore protection on parallel runs */
36  }
37  pp->pp_Done = TRUE;
38  FreeSearchQuery(tmpsq);
39}
40/* \\\ */
41
42#ifdef BENCHMARK
43/* /// "QueryTests()" */
44void QueryTests(struct PTPanGlobal *pg)
45{
46  PT_local *locs;
47  STRPTR ecoli;
48  ULONG ecolilen;
49  ULONG pos;
50  ULONG qlen;
51  char buf[32];
52
53  locs = (PT_local *) calloc(1, sizeof(PT_local));
54  ecoli = FilterSequence(pg, pg->pg_EcoliSeq);
55  ecolilen = strlen(ecoli);
56  locs->pm_max = 0; /* exact search */
57  locs->pm_complement = 0;
58  locs->pm_reversed = 0;
59  locs->sort_by = SORT_HITS_WEIGHTED;
60
61  //qlen = 18;
62  for(locs->pm_max = 4; locs->pm_max < 5; locs->pm_max++)
63  {
64//    for(qlen = 20; qlen - locs->pm_max >= 10; qlen--)
65    for(qlen = 9 + locs->pm_max; qlen >= 10; qlen--)
66//    for(qlen = 31; qlen >= 16; qlen--)
67    {
68      pg->pg_Bench.ts_Hits = 0;
69      pg->pg_Bench.ts_UnsafeHits = 0;
70      pg->pg_Bench.ts_UnsafeKilled = 0;
71      pg->pg_Bench.ts_DupsKilled = 0;
72      pg->pg_Bench.ts_CrossBoundKilled = 0;
73      pg->pg_Bench.ts_DotsKilled = 0;
74      pg->pg_Bench.ts_OutHits = 0;
75      pg->pg_Bench.ts_CandSetTime= 0;
76      pg->pg_Bench.ts_OutputTime = 0;
77
78      for(pos = 0; pos < ecolilen - qlen; pos += 2)
79      {
80       strncpy(buf, &ecoli[pos], qlen);
81       buf[qlen] = 0;
82       probe_match(locs, strdup(buf));
83      }
84      printf("qDAT: (queries qlen err hits gentime unsafe unkill dupskill crosskill dotskill outhits outtime)\n");
85      printf("%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %s QDAT\n",
86             pos, qlen, locs->pm_max,
87             pg->pg_Bench.ts_Hits,
88             pg->pg_Bench.ts_CandSetTime,
89             pg->pg_Bench.ts_UnsafeHits,
90             pg->pg_Bench.ts_UnsafeKilled,
91             pg->pg_Bench.ts_DupsKilled,
92             pg->pg_Bench.ts_CrossBoundKilled,
93        pg->pg_Bench.ts_DotsKilled,
94        pg->pg_Bench.ts_OutHits,
95        pg->pg_Bench.ts_OutputTime,
96        pg->pg_DBName);
97      fflush(stdout);
98    }
99  }
100  printf("Done\n");
101  free(ecoli);
102  free(locs);
103}
104/* \\\ */
105#endif
106
107
108void PP_convertBondMatrix(PT_pdc *pdc, PTPanGlobal *pg)
109{
110    for (int query = SEQCODE_A; query <= SEQCODE_T; ++query) {
111        for (int species = SEQCODE_A; species <= SEQCODE_T; ++species) {
112            int rowIdx     = (pg->pg_ComplementTable[query] - SEQCODE_A)*4;
113            int maxIdx     = rowIdx + query - SEQCODE_A;
114            int newIdx     = rowIdx + species - SEQCODE_A;
115           
116            double max_bind = pdc->bond[maxIdx].val;
117            double new_bind = pdc->bond[newIdx].val;
118           
119            pg->pg_MismatchWeights.mw_Replace[query * ALPHASIZE + species] = max_bind - new_bind;
120        }
121    }
122#if defined(DEBUG)
123    printf("Current bond values:\n");
124    for (int y = 0; y<4; y++) {
125        for (int x = 0; x<4; x++) {
126            printf("%5.2f", pdc->bond[y*4+x].val);
127        }
128        printf("\n");
129    }
130    printf("Current Replace Matrix:\n");
131    for (int query = SEQCODE_A; query <= SEQCODE_T; ++query) {
132        for (int species = SEQCODE_A; species <= SEQCODE_T; ++species) {
133            printf("%5.2f", pg->pg_MismatchWeights.mw_Replace[query * ALPHASIZE + species]);
134        }
135        printf("\n");
136    }
137#endif // DEBUG
138}
139
140
141static double PP_calc_position_wmis(int pos, int seq_len, double y1, double y2)
142{
143    return (double)(((double)(pos * (seq_len - 1 - pos)) / (double)((seq_len - 1) * (seq_len - 1)))* (double)(y2*4.0) + y1);
144}
145
146
147void PP_buildPosWeight(SearchQuery *sq)
148{
149    if (sq->sq_PosWeight) delete[] sq->sq_PosWeight;
150    //printf("buildPosWeight: ...new double[%i];\n", sq->sq_QueryLen+1);
151    sq->sq_PosWeight = new double[sq->sq_QueryLen+1];       // TODO: check if +1 is necessary
152
153    for (int pos=0; pos < sq->sq_QueryLen; ++pos) {
154        if (sq->sq_SortMode == SORT_HITS_WEIGHTED) {
155            sq->sq_PosWeight[pos] = PP_calc_position_wmis(pos, sq->sq_QueryLen, 0.3, 1.0);
156        }else{
157            sq->sq_PosWeight[pos] = 1.0;
158        }
159    }
160    sq->sq_PosWeight[sq->sq_QueryLen] = 0.0;                // TODO: check if last pos is necessary
161#if defined(DEBUG)
162    printf("sq_Posweight[]: ");
163    for (int pos=0; pos < sq->sq_QueryLen; ++pos) {
164        printf("%f, ", sq->sq_PosWeight[pos]);
165    }
166    printf("%f\n", sq->sq_PosWeight[sq->sq_QueryLen]);
167#endif
168}
169
170
171/* /// "probe_match()" */
172extern "C" int probe_match(PT_local *locs, aisc_string probestring)
173{
174  struct PTPanGlobal *pg = PTPanGlobalPtr;
175  struct PTPanPartition *pp;
176  struct SearchQuery *sq;
177  struct SearchQuery *compsq = NULL;
178  PT_probematch *ml;
179
180  pg->pg_SearchPrefs = locs;
181
182  PP_convertBondMatrix(locs->pdc, pg);
183
184  /* find out where a given probe matches */
185  if (PTPanGlobalPtr->pg_verbose >0) {
186    printf("Search request for %s (errs = %d, compl = %d, rev = %d, weight = %d)\n",
187           probestring,
188           pg->pg_SearchPrefs->pm_max,
189           pg->pg_SearchPrefs->pm_complement,
190           pg->pg_SearchPrefs->pm_reversed,
191           pg->pg_SearchPrefs->sort_by);
192  }
193
194  /* free the old sequence */
195  if(pg->pg_SearchPrefs->pm_sequence)
196  {
197    free(pg->pg_SearchPrefs->pm_sequence);
198  }
199  pg->pg_SearchPrefs->pm_sequence = FilterSequence(pg, probestring);
200
201  /* do we need to check the complement instead of the normal one? */
202  if(pg->pg_SearchPrefs->pm_complement)
203  {
204    ComplementSequence(pg, pg->pg_SearchPrefs->pm_sequence);
205  }
206
207  /* do we need to look at the reversed sequence as well? */
208  if(pg->pg_SearchPrefs->pm_reversed)
209  {
210    if(pg->pg_SearchPrefs->pm_csequence)
211    {
212      free(pg->pg_SearchPrefs->pm_csequence);
213    }
214    pg->pg_SearchPrefs->pm_csequence = strdup(pg->pg_SearchPrefs->pm_sequence);
215    ReverseSequence(pg, pg->pg_SearchPrefs->pm_csequence);
216    ComplementSequence(pg, pg->pg_SearchPrefs->pm_csequence);
217  }
218
219  //psg.main_probe = strdup(probestring);
220
221  /* clear all old matches */
222  while((ml = pg->pg_SearchPrefs->pm))
223  {
224    destroy_PT_probematch(ml);
225  }
226
227#if 1
228  /* check, if the probe string is too short */
229  if(strlen(pg->pg_SearchPrefs->pm_sequence) +
230     (2 * pg->pg_SearchPrefs->pm_max) < MIN_PROBE_LENGTH)
231  {
232    SetARBErrorMsg(pg->pg_SearchPrefs, (STRPTR) "error: probe too short!!\n");
233    free(probestring);
234    return(0);
235  }
236#endif
237
238  /* allocate query that configures and holds all the merged results */
239  sq = AllocSearchQuery(pg);
240
241  /* prefs */
242  sq->sq_Query = (STRPTR) pg->pg_SearchPrefs->pm_sequence;
243  sq->sq_QueryLen = strlen(sq->sq_Query);
244  sq->sq_MaxErrors = (float) pg->pg_SearchPrefs->pm_max;
245  sq->sq_Reversed = FALSE;
246  sq->sq_AllowReplace = TRUE;
247  sq->sq_AllowInsert = TRUE;
248  sq->sq_AllowDelete = TRUE;
249  sq->sq_KillNSeqsAt = strlen(sq->sq_Query) / 3;
250  sq->sq_MinorMisThres = pg->pg_SearchPrefs->pdc->split;
251  sq->sq_SortMode = pg->pg_SearchPrefs->sort_by;
252
253  /* init */
254  sq->sq_PTPanPartition = NULL;
255  PP_buildPosWeight(sq);
256  if(pg->pg_SearchPrefs->sort_by)
257  {
258    /* user requested weighted searching */
259    sq->sq_MismatchWeights = &sq->sq_PTPanGlobal->pg_MismatchWeights;
260  } else {
261    /* user wants unified searching */
262    sq->sq_MismatchWeights = &sq->sq_PTPanGlobal->pg_NoWeights;
263  }
264
265  /* do we need to do a second query on the complement? */
266  if(pg->pg_SearchPrefs->pm_reversed)
267  {
268    compsq = CloneSearchQuery(sq);
269    compsq->sq_Query = (STRPTR) pg->pg_SearchPrefs->pm_csequence;
270    compsq->sq_Reversed = TRUE;
271  }
272
273  /* start time here */
274#ifdef BENCHMARK
275  if (PTPanGlobalPtr->pg_verbose >0)
276    BenchTimePassed(pg);
277#endif
278
279  /* search over partitions that are still in cache */
280  pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head;
281  while(pp->pp_Node.ln_Succ)
282  {
283    pp->pp_Done = FALSE;
284    if(CacheDataLoaded(pp->pp_CacheNode))
285    {
286      /* search normal */
287      SearchPartition(pp, sq);
288      /* and optionally, search complement */
289      if(compsq)
290      {
291    SearchPartition(pp, compsq);
292      }
293    }
294    pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ;
295  }
296
297  /* search over all partitions not done yet */
298  pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head;
299  while(pp->pp_Node.ln_Succ)
300  {
301    if(!pp->pp_Done)
302    {
303      /* search normal */
304      SearchPartition(pp, sq);
305      /* and optionally, search complement */
306      if(compsq)
307      {
308    SearchPartition(pp, compsq);
309      }
310    }
311    pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ;
312  }
313
314#ifdef BENCHMARK
315  if (PTPanGlobalPtr->pg_verbose >0)
316    pg->pg_Bench.ts_CandSetTime += BenchTimePassed(pg);
317#endif
318
319  SortHitsList(sq);
320  CreateHitsGUIList(sq);
321  FreeSearchQuery(sq);
322  if(compsq)
323  {
324    SortHitsList(compsq);
325    CreateHitsGUIList(compsq);
326    FreeSearchQuery(compsq);
327  }
328
329#ifdef BENCHMARK
330  if (PTPanGlobalPtr->pg_verbose >0)
331    pg->pg_Bench.ts_OutputTime += BenchTimePassed(pg);
332#endif
333
334  free(probestring); /* I actually don't know, if this is required */
335  return 0;
336}
337/* \\\ */
338
339/* /// "SortHitsList()" */
340void SortHitsList(struct SearchQuery *sq)
341{
342  //struct PTPanGlobal *pg = sq->sq_PTPanGlobal;
343  struct QueryHit *qh;
344
345  /* enter priority and sort */
346  qh = (struct QueryHit *) sq->sq_Hits.lh_Head;
347  if(sq->sq_SortMode == SORT_HITS_NOWEIGHT)
348  {
349    /* sorting criteria:
350       - normal/composite (1 bit)
351       - replace only or insert/delete (1 bit)
352       - mismatch count (5 bits)
353       - error count (8 bits)
354       - species (20 bits)
355       - absolute position (28 bits)
356    */
357    //printf("Sort no weight...\n");
358    while(qh->qh_Node.ln_Succ)
359    {
360      arb_assert(((LLONG) (qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount)) <= 0x1f);  // 5 bit
361      arb_assert(((LLONG) round(qh->qh_ErrorCount * 10.0)) <= 0xff);                        //  8 bit
362      arb_assert(((LLONG) qh->qh_Species->ps_Num) <= 0xfffff);                              // 20 bit
363      arb_assert(((LLONG) (qh->qh_AbsPos - qh->qh_Species->ps_AbsOffset)) <= 0xfffffff);    // 28 bit
364      qh->qh_Node.ln_Pri = (LLONG)
365    ((qh->qh_Flags & QHF_REVERSED) ? (1LL << 62) : 0LL) +
366        ((qh->qh_InsertCount | qh->qh_DeleteCount) ? (1LL << 61) : 0LL) +
367    (((LLONG) (qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount)) << 56) +
368    (((LLONG) round(qh->qh_ErrorCount * 10.0)) << 48) +
369    (((LLONG) qh->qh_Species->ps_Num) << 28) +
370    ((LLONG) (qh->qh_AbsPos - qh->qh_Species->ps_AbsOffset));
371      qh = (struct QueryHit *) qh->qh_Node.ln_Succ;
372    }
373  } else {
374    //printf("Sort with weight...\n");
375    while(qh->qh_Node.ln_Succ)
376    {
377      arb_assert(((LLONG) (qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount)) <= 0x1f);  // 5 bit
378      arb_assert(((LLONG) round(qh->qh_ErrorCount * 10.0)) <= 0xff);                        //  8 bit
379      arb_assert(((LLONG) qh->qh_Species->ps_Num) <= 0xfffff);                              // 20 bit
380      arb_assert(((LLONG) (qh->qh_AbsPos - qh->qh_Species->ps_AbsOffset)) <= 0xfffffff);    // 28 bit
381      qh->qh_Node.ln_Pri = (LLONG)
382    ((LLONG) (qh->qh_Flags & QHF_REVERSED) ? (1LL << 62) : 0LL) +
383        ((LLONG) (qh->qh_InsertCount | qh->qh_DeleteCount) ? (1LL << 61) : 0LL) +
384    (((LLONG) round(qh->qh_ErrorCount * 10.0)) << 53) +
385    (((LLONG) (qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount)) << 48) +
386    ((LLONG) qh->qh_Species->ps_Num << 28) +
387    ((LLONG) (qh->qh_AbsPos - qh->qh_Species->ps_AbsOffset));
388      //printf("%16llx\n", qh->qh_Node.ln_Pri);
389      qh = (struct QueryHit *) qh->qh_Node.ln_Succ;
390    }
391  }
392  SortList(&sq->sq_Hits);
393#if 0
394  qh = (struct QueryHit *) sq->sq_Hits.lh_Head;
395  //printf("... and after\n");
396  while(qh->qh_Node.ln_Succ)
397  {
398    //printf("%16llx\n", qh->qh_Node.ln_Pri);
399    qh = (struct QueryHit *) qh->qh_Node.ln_Succ;
400  }
401#endif
402}
403/* \\\ */
404
405
406/* /// "CreateHitsGUIList()" */
407void CreateHitsGUIList(struct SearchQuery *sq)
408{
409  struct PTPanGlobal *pg = sq->sq_PTPanGlobal;
410  struct PTPanSpecies *ps;
411  struct QueryHit *qh;
412  STRPTR srcptr;
413  STRPTR tarptr;
414  ULONG maxlen;
415  float minweight;
416  ULONG cnt;
417  ULONG numhits;
418  ULONG tarlen;
419
420  if (PTPanGlobalPtr->pg_verbose >0) printf(">> CreateHitsGUIList\n");
421   
422  /* calculate maximum size of string that we have to examine */
423  minweight = sq->sq_MismatchWeights->mw_Delete[0];
424  for(cnt = 1; cnt < pg->pg_AlphaSize; cnt++)
425  {
426    if(sq->sq_MismatchWeights->mw_Delete[cnt] < minweight)
427    {
428      minweight = sq->sq_MismatchWeights->mw_Delete[cnt];
429    }
430  }
431  maxlen = sq->sq_QueryLen + (ULONG) ((sq->sq_MaxErrors + minweight) / minweight);
432  sq->sq_SourceSeq = (STRPTR) malloc(maxlen + 1);
433
434  numhits = 0;
435  pg->pg_SpeciesCache->ch_SwapCount = 0;
436
437  qh = (struct QueryHit *) sq->sq_Hits.lh_Head;
438  while(qh->qh_Node.ln_Succ)
439  {
440    LONG relpos;
441    BOOL good;
442    ULONG nmismatch;
443    UBYTE code;
444    UBYTE seqcode;
445
446    STRPTR seqout;
447    STRPTR seqptr;
448    LONG   relposcnt;
449    PT_probematch *ml;
450
451    good = TRUE;
452    ps = qh->qh_Species;
453    char prefix[10], postfix[10];
454    for (int i = 0; i < 9; ++i)
455    {
456        prefix[i] = '>';
457        postfix[i] = '<';
458    }
459    prefix[9] = postfix[9] = 0x00;
460
461    relpos    = 0;
462    nmismatch = 0;
463    ULONG abspos = qh->qh_AbsPos - ps->ps_AbsOffset;
464    ULONG bitpos = 0;
465    ULONG count;
466    /* given an absolute sequence position, search for the relative one,
467       e.g. abspos 2 on "-----UU-C-C" will yield 8
468            abspos:           01 2 3
469            relpos:      0123456789a     */
470/*           
471if (strcmp(ps->ps_Name, "BclSp114") == 0)
472{
473    printf("qh_AbsPos: %li\t\tps_AbsOffset: %li\t\tabspos:%li\n",
474           qh->qh_AbsPos, ps->ps_AbsOffset, abspos);
475    while ((code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count)) != 0xff)
476    {
477        if (count > 1) printf("%li%c", count, code);
478                  else printf("%c", code);
479    }
480    printf("\n");
481    bitpos = 0;
482}
483*/
484    while (bitpos < ps->ps_SeqDataCompressedSize)           // get relpos and store prefix
485    {
486        code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count);
487
488        if (pg->pg_SeqCodeValidTable[code])
489        {                                                       // it's a valid char
490            if (!(abspos--)) break;                             // position found
491            if (abspos <= 8) prefix[8-abspos] = code;           // store prefix
492            ++relpos;
493        } else
494        {                                                       // it's not a valid char
495            arb_assert((code == '.') || (code == '-'));
496           
497#ifdef ALLOWDOTSINMATCH
498            if ((code == '.') && (count == 1))                  // check for dots in match
499            {
500                if (!(abspos--)) break;                         // position found
501                if (abspos <= 8) prefix[8-abspos] = code;       // store prefix
502            }
503#endif
504            relpos += count;
505            if ((code == '.') && (abspos <= 9))                 // fill prefix with '.'
506            {                                                   // TODO: decide if we really want to fill the
507                for (int i = 0; i < (9 - abspos); ++i)          //       whole prefix or just 'count' dots
508                {
509                    prefix[i] = '.';
510                }
511            }
512        }
513    }
514    arb_assert(bitpos < ps->ps_SeqDataCompressedSize);
515    bitpos -= 3;      // bitpos now points to the first character of found seq
516//printf("%c, %li, %li, %li\n", code, ((unsigned long)code), count, bitpos);   
517    tarlen = sq->sq_QueryLen - qh->qh_DeleteCount + qh->qh_InsertCount;
518    for (cnt = 0; cnt < tarlen;)                                // copy found string into sq->sq_SourceSeq[]
519    {
520        if (bitpos >= ps->ps_SeqDataCompressedSize)
521        {
522            arb_assert(false);
523            good = FALSE;
524            break;
525        }
526
527        code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count);
528        if (pg->pg_SeqCodeValidTable[code])                 // valid character
529        {
530            sq->sq_SourceSeq[cnt++] = code;
531            if (code == 'N') nmismatch++;
532        } else
533        {
534            if (code == '.')                                // if we got a dot in sequence
535            {                                               // the hit is bogus
536#ifdef ALLOWDOTSINMATCH
537                if (count == 1)
538                {
539                    sq->sq_SourceSeq[cnt++] = '.';
540                } else
541#endif               
542                {
543                    pg->pg_Bench.ts_DotsKilled++;
544                    good = FALSE;
545                    break;
546                }
547            }
548        }
549    }
550    sq->sq_SourceSeq[tarlen] = 0;
551    if(nmismatch == tarlen) good = FALSE;
552
553    if(good)
554    {
555        /* we need to verify the hit? */
556        if(qh->qh_Flags & QHF_UNSAFE)
557        {
558            pg->pg_Bench.ts_UnsafeHits++;
559
560            good = MatchSequence(sq);
561            if(!good)
562            {
563                pg->pg_Bench.ts_UnsafeKilled++;
564                //printf("Verify failed on %s != %s\n", sq->sq_SourceSeq, sq->sq_Query);
565                qh->qh_Flags &= ~QHF_ISVALID;
566            } else {
567                /* fill in correct match */
568                qh->qh_ErrorCount   = sq->sq_State.sqs_ErrorCount;
569                qh->qh_ReplaceCount = sq->sq_State.sqs_ReplaceCount;
570                qh->qh_InsertCount  = sq->sq_State.sqs_InsertCount;
571                qh->qh_DeleteCount  = sq->sq_State.sqs_DeleteCount;
572            }
573            //qh->qh_Flags &= ~QHF_UNSAFE;
574        }
575    } //else printf("'.'-Sequence!\n");
576
577    if(good)
578    {
579        seqout = (STRPTR) calloc(9 + 1 + sq->sq_QueryLen + 1 + 9 + 1, 0x01);
580        strncpy(seqout, prefix, 0x09);                          // copy prefix
581        seqout[9] = '-';                                        // 1st delimiter
582        good = FindSequenceMatch(sq, qh, &seqout[10]);          // generate mismatch sequence */
583        seqout[10 + sq->sq_QueryLen] = '-';                     // 2nd delimiter
584        if (!good) free(seqout);
585    }
586
587    if (good)
588    {
589        for (cnt = 0; cnt < 9;)                                 // generate postfix
590        {
591            code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count);
592            if (code == 0xff) break;
593            if (pg->pg_SeqCodeValidTable[code])                 // valid character
594            { 
595                postfix[cnt++] = code;
596            } else if (code == '.')                             // '.' found
597            {
598                for (; cnt < 9; ++cnt)                          // fill postfix with '.'
599                {                                               // TODO: decide if we really want to fill the
600                    postfix[cnt] = '.';                         //       whole postfix or just 'count' dots
601                }
602            }
603        }
604
605        strncpy(&seqout[11 + sq->sq_QueryLen], postfix, 0x09);  // copy postfix
606
607        ml = create_PT_probematch();
608        ml->name = qh->qh_Species->ps_Num;
609        ml->b_pos = relpos;
610        ml->rpos = qh->qh_AbsPos - ps->ps_AbsOffset;
611        ml->wmismatches = (double) qh->qh_ErrorCount;
612        ml->mismatches = qh->qh_ReplaceCount + qh->qh_InsertCount + qh->qh_DeleteCount;
613        ml->N_mismatches = nmismatch;
614        ml->sequence = seqout; /* warning! potentional memory leak -- FIX destroy_PT_probematch(ml) */
615        ml->reversed = (qh->qh_Flags & QHF_REVERSED) ? 1 : 0;
616
617        aisc_link((struct_dllpublic_ext *) &(pg->pg_SearchPrefs->ppm), (struct_dllheader_ext *) ml);
618        numhits++;
619
620        if (PTPanGlobalPtr->pg_verbose >0) printf("SeqOut: '%s'\n", seqout);
621    }
622
623    RemQueryHit(qh);
624    qh = (struct QueryHit *) sq->sq_Hits.lh_Head;
625
626  } // while(qh->qh_Node.ln_Succ)
627  free(sq->sq_SourceSeq);
628
629  if (PTPanGlobalPtr->pg_verbose >0) {
630    pg->pg_Bench.ts_OutHits += numhits;
631    printf("<< CreateHitsGUIList: Number of hits %ld (SwapCount %ld)\n",
632    numhits, pg->pg_SpeciesCache->ch_SwapCount);
633  }
634}
635/* \\\ */
636
637/* /// "get_match_info()" */
638extern "C" STRPTR get_match_info(PT_probematch *ml)
639{
640  struct PTPanGlobal *pg = PTPanGlobalPtr;
641  struct PTPanSpecies *ps;
642  ULONG ecolipos = 0;
643
644  /* calculate ecoli position in O(1) */
645  if(pg->pg_EcoliBaseTable)
646  {
647    if((ULONG) ml->b_pos < pg->pg_EcoliSeqSize)
648    {
649      ecolipos = pg->pg_EcoliBaseTable[ml->b_pos];
650    } else {
651      ecolipos = pg->pg_EcoliBaseTable[pg->pg_EcoliSeqSize];
652    }
653  }
654  ps = pg->pg_SpeciesMap[ml->name];
655  sprintf(pg->pg_TempBuffer, "%10s %-30.30s %2d  %2d    %1.1f %7d %4ld  %1d  %s",
656    ps->ps_Name, ps->ps_FullName,
657    ml->mismatches, ml->N_mismatches, ml->wmismatches,
658    ml->b_pos, ecolipos,
659    ml->reversed, ml->sequence);
660
661  if (PTPanGlobalPtr->pg_verbose >0) printf("== get_match_info: %s\n", pg->pg_TempBuffer);
662
663  return(pg->pg_TempBuffer);
664}
665/* \\\ */
666
667/* /// "GetMatchListHeader()" */
668STRPTR GetMatchListHeader(STRPTR seq)
669{
670  STRPTR res;
671
672  if(seq)
673  {
674    res = (STRPTR) GBS_global_string("   name      fullname                       "
675                    "mis N_mis wmis    pos ecoli rev         '%s'", seq);
676  } else {
677    res = (STRPTR) "   name      fullname                       "
678      "mis N_mis wmis    pos ecoli rev";
679  }
680
681  if (PTPanGlobalPtr->pg_verbose >0) printf("== GetMatchListHeader: %s\n", res);
682
683  return(res);
684}
685/* \\\ */
686
687/* /// "get_match_hinfo()" */
688extern "C" STRPTR get_match_hinfo(PT_probematch *)
689{
690  return(GetMatchListHeader(NULL));
691}
692/* \\\ */
693
694/* /// "c_get_match_hinfo()" */
695extern "C" STRPTR c_get_match_hinfo(PT_probematch *)
696{
697  printf("EXTERN: c_get_match_hinfo\n");
698  return(GetMatchListHeader(NULL));
699}
700/* \\\ */
701
702/* /// "match_string()" */
703/* Create a big output string:  header\001name\001info\001name\001info....\000 */
704extern "C" bytestring * match_string(PT_local *locs)
705{
706    struct PTPanGlobal *pg = PTPanGlobalPtr;
707    struct GBS_strstruct *outstr;
708    PT_probematch *ml;
709    STRPTR srcptr;
710    LONG entryCount = 0;
711 
712    printf("EXTERN: match_string\n");
713    free(pg->pg_ResultString.data);             // free old memory
714    for(ml = locs->pm; ml; ml = ml->next)       // count number of entries
715        ++entryCount;
716   
717    outstr = GBS_stropen(entryCount * 150);     // 150 bytes per entry seemes to be a good estimation
718
719    if(locs->pm)                                // add header
720    {
721        srcptr = GetMatchListHeader(locs->pm->reversed ? locs->pm_csequence : locs->pm_sequence);
722        GBS_strcat(outstr, srcptr);
723        GBS_chrcat(outstr, 1);
724    }
725       
726    for(ml = locs->pm; ml; ml = ml->next)       // add each entry to the list
727    {
728        srcptr = virt_name(ml);                 // add the name
729        GBS_strcat(outstr, srcptr);
730        GBS_chrcat(outstr, 1);
731
732        srcptr = get_match_info(ml);            // and the info
733        GBS_strcat(outstr, srcptr);
734        GBS_chrcat(outstr, 1);
735    }
736
737    pg->pg_ResultString.data = GBS_strclose(outstr); 
738    pg->pg_ResultString.size = strlen(pg->pg_ResultString.data) + 1;
739
740    if (PTPanGlobalPtr->pg_verbose >0) printf("== match_string: %s\n", pg->pg_ResultString.data);
741
742#ifdef DEBUG
743    printf("%li entries used %li bytes (%li MB) of buffer: %5.2f byte per entry\n", 
744           entryCount, pg->pg_ResultString.size, pg->pg_ResultString.size >> 20, 
745           (double)pg->pg_ResultString.size/(double)entryCount);
746#endif
747  return(&pg->pg_ResultString);
748}
749/* \\\ */
750
751/* /// "MP_match_string()" */
752/* Create a big output string:  header\001name\001#mismatch\001name\001#mismatch....\000 */
753extern "C" bytestring * MP_match_string(PT_local *locs)
754{
755  struct PTPanGlobal *pg = PTPanGlobalPtr;
756  PT_probematch *ml;
757  STRPTR outptr;
758  STRPTR srcptr;
759  LONG buflen = 100000000;                 // TODO: calculate buflen instead of using hard coded value
760
761  printf("EXTERN: MP_match_string\n");
762  /* free old memory */
763  free(pg->pg_ResultMString.data);
764
765  outptr = (STRPTR) malloc(buflen);
766  pg->pg_ResultMString.data = outptr;
767
768  buflen--; /* space for termination byte */
769
770  LONG entryCount = 0;
771  /* add each entry to the list */
772  for(ml = locs->pm; ml; ml = ml->next)
773  {
774    ++entryCount;
775    /* add the name */
776    srcptr = virt_name(ml);
777    while((--buflen > 0) && (*outptr++ = *srcptr++));
778    if(buflen <= 0)
779    {
780      printf("ERROR: buffer too small - see function MP_match_string(...) in file PT_match.cxx\n");
781      break;
782    }
783    outptr[-1] = 1;
784
785    /* and and the mismatch and wmismatch count */
786    sprintf(pg->pg_TempBuffer, "%2d\001%1.1f", ml->mismatches, ml->wmismatches);
787    srcptr = pg->pg_TempBuffer;
788    while((--buflen > 0) && (*outptr++ = *srcptr++));
789    if(buflen <= 0)
790    {
791      printf("ERROR: buffer too small - see function MP_match_string(...) in file PT_match.cxx\n");
792      break;
793    }
794    outptr[-1] = 1;
795  }
796  /* terminate string */
797  *outptr++ = 0;
798
799  pg->pg_ResultMString.size = (ULONG) outptr - (ULONG) pg->pg_ResultMString.data;
800  /* free unused memory */
801  pg->pg_ResultMString.data = (STRPTR) realloc(pg->pg_ResultMString.data,
802                    pg->pg_ResultMString.size);
803
804  if (PTPanGlobalPtr->pg_verbose >0) printf("== MP_match_string: %s\n", pg->pg_ResultString.data);
805
806printf("%li entries used %li bytes (%li MB) of buffer: %5.2f byte per entry\n", 
807        entryCount, (100000000-buflen), (100000000-buflen) >> 20, (double)(100000000-buflen)/(double)entryCount);
808  return(&pg->pg_ResultMString);
809}
810/* \\\ */
811
812/* /// "MP_all_species_string()" */
813/* Create a big output string: 001name\001name\....\000 */
814extern "C" bytestring * MP_all_species_string(PT_local *)
815{
816  struct PTPanGlobal *pg = PTPanGlobalPtr;
817  struct PTPanSpecies *ps;
818  STRPTR outptr;
819  STRPTR srcptr;
820//  LONG buflen = 500000; /* enough for about 50000 species */
821  LONG buflen = 100000000;                 // TODO: calculate buflen instead of using hard coded value
822
823  printf("EXTERN: MP_all_species_string\n");
824  /* free old memory */
825  free(pg->pg_SpeciesString.data);
826
827  outptr = (STRPTR) malloc(buflen);
828  pg->pg_SpeciesString.data = outptr;
829
830  buflen--; /* space for termination byte */
831
832  LONG entryCount = 0;
833  /* add each entry to the list */
834  ps = (struct PTPanSpecies *) pg->pg_Species.lh_Head;
835  while(ps->ps_Node.ln_Succ)
836  {
837    ++entryCount;
838    /* add the name */
839    srcptr = ps->ps_Name;
840    while((--buflen > 0) && (*outptr++ = *srcptr++));
841    if(buflen <= 0)
842    {
843      printf("ERROR: buffer too small - see function MP_all_species_string(...) in file PT_match.cxx\n");
844      break;
845    }
846    outptr[-1] = 1;
847    ps = (struct PTPanSpecies *) ps->ps_Node.ln_Succ;
848  }
849  /* terminate string */
850  *outptr++ = 0;
851
852  pg->pg_SpeciesString.size = (ULONG) outptr - (ULONG) pg->pg_SpeciesString.data;
853  /* free unused memory */
854  pg->pg_SpeciesString.data = (STRPTR) realloc(pg->pg_SpeciesString.data,
855                        pg->pg_SpeciesString.size);
856printf("%li entries used %li bytes (%li MB) of buffer: %5.2f byte per entry\n", 
857        entryCount, (100000000-buflen), (100000000-buflen) >> 20, (double)(100000000-buflen)/(double)entryCount);
858  return(&pg->pg_SpeciesString);
859}
860/* \\\ */
861
862/* /// "MP_count_all_species()" */
863extern "C" int MP_count_all_species(PT_local *)
864{
865  struct PTPanGlobal *pg = PTPanGlobalPtr;
866  printf("EXTERN: MP_count_all_species\n");
867  return(pg->pg_NumSpecies);
868}
869/* \\\ */
870
871
Note: See TracBrowser for help on using the repository browser.