1 | #include "globalprefs.h" |
---|
2 | #include "types.h" |
---|
3 | #include "dlist.h" |
---|
4 | #include "hooks.h" |
---|
5 | |
---|
6 | #include <PT_com.h> |
---|
7 | #include <arbdb.h> |
---|
8 | #include "pt_manualprotos.h" |
---|
9 | |
---|
10 | #define FILESTRUCTVERSION 0x0105 /* version<<8 and revision */ |
---|
11 | |
---|
12 | #define BENCHMARK |
---|
13 | |
---|
14 | /* compressed sequence codes */ |
---|
15 | #define SEQCODE_N 0 |
---|
16 | #define SEQCODE_A 1 |
---|
17 | #define SEQCODE_C 2 |
---|
18 | #define SEQCODE_G 3 |
---|
19 | #define SEQCODE_T 4 |
---|
20 | #define SEQCODE_DOT 5 |
---|
21 | #define SEQCODE_HYPHEN 6 |
---|
22 | #define SEQCODE_IGNORE 255 |
---|
23 | |
---|
24 | /* 5261276th prime, small enough for pg->pg_AlphaSize < 47 */ |
---|
25 | #define HASHPRIME 90809777 |
---|
26 | |
---|
27 | /* mismatch matrix and edit distance weights */ |
---|
28 | struct MismatchWeights |
---|
29 | { |
---|
30 | float mw_Replace[ALPHASIZE*ALPHASIZE]; /* costs for replacing one char by the other */ |
---|
31 | float mw_Insert[ALPHASIZE]; /* cost for inserting one char */ |
---|
32 | float mw_Delete[ALPHASIZE]; /* cost for deleting one char */ |
---|
33 | }; |
---|
34 | |
---|
35 | /* benchmarking/debug */ |
---|
36 | struct TimeStats |
---|
37 | { |
---|
38 | struct timeval ts_Init; /* first time stamp after init */ |
---|
39 | struct timeval ts_End; /* total time */ |
---|
40 | struct timeval ts_Last; /* last stamp */ |
---|
41 | struct timeval ts_Now; /* temporary stamp */ |
---|
42 | |
---|
43 | ULONG ts_CollectDB; /* scanning for species */ |
---|
44 | ULONG ts_MergeDB; /* merging database */ |
---|
45 | ULONG ts_PrefixScan; /* partition prefix scanning */ |
---|
46 | ULONG ts_MemTree; /* memory tree generation */ |
---|
47 | ULONG ts_TreeStats; /* depth, short edge, long edge */ |
---|
48 | ULONG ts_LongDictPre; /* long edge dict presorting */ |
---|
49 | ULONG ts_LongDictBuild; /* long edge dict building */ |
---|
50 | ULONG ts_Reloc; /* relocation */ |
---|
51 | ULONG ts_Writing; /* writing of tree to disk */ |
---|
52 | ULONG ts_TotalBuild; /* total time spent for building */ |
---|
53 | ULONG ts_Hits; /* total number of candidates */ |
---|
54 | ULONG ts_UnsafeHits; /* number of hits being verified */ |
---|
55 | ULONG ts_UnsafeKilled; /* number of false positives */ |
---|
56 | ULONG ts_DupsKilled; /* duplicates filtered */ |
---|
57 | ULONG ts_CrossBoundKilled; /* hits, that were filtered due to cross boundary */ |
---|
58 | ULONG ts_DotsKilled; /* filtered hits that were in non-coding areas */ |
---|
59 | ULONG ts_OutHits; /* number of output lines generated */ |
---|
60 | ULONG ts_CandSetTime; /* time for generating candidates */ |
---|
61 | ULONG ts_OutputTime; /* time for generating output */ |
---|
62 | }; |
---|
63 | |
---|
64 | /* global persistant data structure */ |
---|
65 | struct PTPanGlobal |
---|
66 | { |
---|
67 | struct Node pg_Node; /* linkage */ |
---|
68 | |
---|
69 | /* communication stuff */ |
---|
70 | PT_main *pg_AISC; /* main communication pointer */ |
---|
71 | struct arb_params *pg_ArbParams; |
---|
72 | STRPTR pg_ServerName; /* hostname of server */ |
---|
73 | struct Hs_struct *pg_ComSocket; /* the communication socket */ |
---|
74 | |
---|
75 | BOOL pg_UseStdSfxTree; /* selects between PTPan and SfxTree */ |
---|
76 | FILE *pg_IndexFile; /* file handle for main index file */ |
---|
77 | |
---|
78 | STRPTR pg_DBName; /* DataBase name */ |
---|
79 | STRPTR pg_IndexName; /* name of primary (?) index file */ |
---|
80 | |
---|
81 | /* Main data base information */ |
---|
82 | GBDATA *pg_MainDB; /* ARBDB interface */ |
---|
83 | GBDATA *pg_SaiData; |
---|
84 | |
---|
85 | STRPTR pg_AlignmentName; /* Name of the alignment in DB */ |
---|
86 | |
---|
87 | GBDATA *pg_SpeciesData; /* DB pointer to species data */ |
---|
88 | ULLONG pg_TotalSeqSize; /* total size of sequences */ |
---|
89 | ULLONG pg_TotalSeqCompressedSize; // total size of compressed Seq. Data (with '.' and '-') in byte |
---|
90 | ULLONG pg_TotalRawSize; /* total size of data tuples in species */ |
---|
91 | UWORD pg_TotalRawBits; /* number of bits required to store a position pointer */ |
---|
92 | ULONG pg_MaxBaseLength; /* maximum base length over all species */ |
---|
93 | ULONG pg_AllHashSum; /* hash sum over everything to checksum integrity */ |
---|
94 | |
---|
95 | ULONG pg_NumSpecies; /* number of species accounted for */ |
---|
96 | struct List pg_Species; /* list of species data */ |
---|
97 | |
---|
98 | /* stuff for building the index and partitions */ |
---|
99 | ULONG *pg_MergedRawData; /* all compressed and merged sequence data */ |
---|
100 | ULONG pg_MaxPartitionSize; /* maximum allowed number of tuples in each partition */ |
---|
101 | ULONG *pg_HistoTable; /* prefix histogramm */ |
---|
102 | UWORD pg_NumPartitions; /* number of partitions generated */ |
---|
103 | struct List pg_Partitions; /* prefix partitions */ |
---|
104 | UWORD pg_MaxPrefixLen; /* maximum used prefix length */ |
---|
105 | struct PTPanPartition **pg_PartitionLookup; /* quick lookup table */ |
---|
106 | |
---|
107 | /* precalculated Tables for speedup */ |
---|
108 | UWORD pg_AlphaSize; /* size of alphabet */ |
---|
109 | ULONG pg_PowerTable[MAXCODEFITLONG+1]; /* stores powers of ALPHASIZE^i */ |
---|
110 | UWORD pg_BitsShiftTable[MAXCODEFITLONG+1]; /* how many bits to shift left to get valid code */ |
---|
111 | UWORD pg_BitsUseTable[MAXCODEFITLONG+1]; /* how many bits are required to store ALPHASIZE^i */ |
---|
112 | ULONG pg_BitsMaskTable[MAXCODEFITLONG+1]; /* eof bit mask for i codes */ |
---|
113 | |
---|
114 | UWORD pg_BitsCountTable[256]; /* count how many bits are set */ |
---|
115 | |
---|
116 | UBYTE pg_CompressTable[256]; /* table to compress char to { N, A, C, G, T } */ |
---|
117 | UBYTE pg_DecompressTable[ALPHASIZE]; /* inverse mapping of above */ |
---|
118 | UBYTE pg_ComplementTable[ALPHASIZE]; /* table with A<->T, C<->G swapped */ |
---|
119 | UBYTE pg_SeqCodeValidTable[256]; /* 1 for valid char, 0 for invalid (-.) */ |
---|
120 | |
---|
121 | /* various stuff */ |
---|
122 | UWORD pg_PruneLength; /* Pruning control for depth and length */ |
---|
123 | BOOL pg_LowMemoryMode; /* whether to load and keep all sequences in memory */ |
---|
124 | ULONG pg_FreeMem; /* free memory in KB */ |
---|
125 | STRPTR pg_EcoliSeq; /* Ecoli sequence data */ |
---|
126 | ULONG pg_EcoliSeqSize; /* length of ecoli sequence */ |
---|
127 | ULONG *pg_EcoliBaseTable; /* quick table lookup for ecoli base position */ |
---|
128 | |
---|
129 | /* species name hashing */ |
---|
130 | struct BinTree *pg_SpeciesBinTree; /* tree to find a species by its absolute offset */ |
---|
131 | struct PTPanSpecies **pg_SpeciesMap; /* direct mapping to support legacy stuff */ |
---|
132 | |
---|
133 | GB_HASH *pg_SpeciesNameHash; /* species name to int */ |
---|
134 | |
---|
135 | struct CacheHandler *pg_SpeciesCache; /* alignment caching */ |
---|
136 | struct CacheHandler *pg_PartitionCache; /* partition caching */ |
---|
137 | |
---|
138 | /* mismatch matrix */ |
---|
139 | struct MismatchWeights pg_MismatchWeights; /* matrix containing the default mismatch values */ |
---|
140 | struct MismatchWeights pg_NoWeights; /* matrix containing the default mismatch values */ |
---|
141 | |
---|
142 | /* output stuff */ |
---|
143 | PT_local *pg_SearchPrefs; /* search prefs from GUI */ |
---|
144 | bytestring pg_ResultString; /* complete merged output string (header/name/format) */ |
---|
145 | bytestring pg_ResultMString; /* complete merged output string (name/mismatch/wmis) */ |
---|
146 | bytestring pg_SpeciesString; /* complete merged species string (species) */ |
---|
147 | |
---|
148 | bytestring pg_UnknownSpecies; /* string containing the unknown species */ |
---|
149 | |
---|
150 | char pg_TempBuffer[1024]; /* temporary buffer */ |
---|
151 | |
---|
152 | /* benchmarking */ |
---|
153 | struct TimeStats pg_Bench; /* Benchmarking data structure */ |
---|
154 | |
---|
155 | /* Command line flags */ |
---|
156 | UWORD pg_verbose; /* Verbosity level to stdout for debugging */ |
---|
157 | }; |
---|
158 | |
---|
159 | extern struct PTPanGlobal *PTPanGlobalPtr; |
---|
160 | |
---|
161 | /* information on a species */ |
---|
162 | struct PTPanSpecies |
---|
163 | { |
---|
164 | struct Node ps_Node; /* linkage */ |
---|
165 | ULONG ps_Num; /* species number for map */ |
---|
166 | GBDATA *ps_SpeciesDB; /* database pointer */ |
---|
167 | GBDATA *ps_SeqDataDB; /* alignment data pointer */ |
---|
168 | STRPTR ps_Name; /* short name of species */ |
---|
169 | STRPTR ps_FullName; /* long name of species */ |
---|
170 | |
---|
171 | STRPTR ps_SeqData; /* original sequence data */ |
---|
172 | ULONG ps_SeqDataSize; /* length of original sequence data */ |
---|
173 | ULONG *ps_RawData; /* compressed sequence data pointer */ |
---|
174 | ULONG ps_RawDataSize; /* length of alignment data (filtered) */ |
---|
175 | ULONG ps_SeqHash; /* sequence hash sum */ |
---|
176 | |
---|
177 | BOOL ps_IsGroup; /* marked for probe design */ |
---|
178 | BOOL ps_Obsolete; /* this sequence has been removed from the DB, ignore */ |
---|
179 | ULLONG ps_AbsOffset; /* absolute offset in the resulting raw image */ |
---|
180 | ULONG ps_SerialTouch; /* last time touched (for duplicate elimation) */ |
---|
181 | struct CacheNode *ps_CacheNode; /* caching information node */ |
---|
182 | |
---|
183 | STRPTR ps_test; |
---|
184 | UBYTE *ps_SeqDataCompressed; // compressed Seq. Data (with '.' and '-') |
---|
185 | ULONG ps_SeqDataCompressedSize; // size in bits (includes end flag 111) |
---|
186 | }; |
---|
187 | |
---|
188 | /* index partition structure (memory) */ |
---|
189 | struct PTPanPartition |
---|
190 | { |
---|
191 | struct Node pp_Node; /* linkage */ |
---|
192 | struct PTPanGlobal *pp_PTPanGlobal; /* up link */ |
---|
193 | ULONG pp_ID; /* partition id, e.g. used for filename */ |
---|
194 | ULONG pp_Prefix; /* compressed prefix code */ |
---|
195 | ULONG pp_PrefixLen; /* length of prefix code */ |
---|
196 | STRPTR pp_PrefixSeq; /* prefix sequence */ |
---|
197 | ULONG pp_Size; /* number of nodes for this prefix */ |
---|
198 | ULLONG pp_RawOffset; /* offset of this partition inside the total raw data */ |
---|
199 | |
---|
200 | BOOL pp_Done; /* flag to check, if this partition has been done yet */ |
---|
201 | |
---|
202 | /* variables for alternate suffix tree implementation */ |
---|
203 | struct StdSfxNode *pp_StdSfxNodes; /* suffix node array */ |
---|
204 | |
---|
205 | /* intermediate tree in memory */ |
---|
206 | struct SfxNode *pp_SfxRoot; /* root of all evil */ |
---|
207 | ULONG pp_SfxMemorySize; /* size of the node array */ |
---|
208 | UBYTE *pp_SfxNodes; /* suffix node array */ |
---|
209 | ULONG pp_Sfx2EdgeOffset; /* offset for two edge nodes */ |
---|
210 | ULONG pp_SfxNEdgeOffset; /* offset for five edge nodes */ |
---|
211 | ULONG pp_NumBigNodes; /* number of big nodes */ |
---|
212 | ULONG pp_NumSmallNodes; /* number of small nodes */ |
---|
213 | ULONG pp_MaxTreeDepth; /* maximum measured tree depth */ |
---|
214 | ULONG pp_TreePruneDepth; /* at which depth should the tree be pruned? */ |
---|
215 | ULONG pp_TreePruneLength; /* length at which tree should be pruned? */ |
---|
216 | ULONG pp_PruneLeafCnt; /* count of leaves after pruning */ |
---|
217 | |
---|
218 | UBYTE *pp_VerifyArray; /* bitarray to check, if all positions are found */ |
---|
219 | |
---|
220 | struct TreeLevelStats *pp_LevelStats; /* stats for each level */ |
---|
221 | |
---|
222 | ULONG *pp_QuickPrefixLookup; /* hash table lookup */ |
---|
223 | ULONG pp_QuickPrefixCount; /* number of time the qpl was taken */ |
---|
224 | |
---|
225 | struct HuffCode *pp_BranchCode; /* branch combination histogram */ |
---|
226 | |
---|
227 | /* edges huffman code stuff */ |
---|
228 | ULONG pp_EdgeCount; /* number of edges in the final tree */ |
---|
229 | ULONG pp_LongEdgeCount; /* number of long edges */ |
---|
230 | |
---|
231 | struct HuffCode *pp_ShortEdgeCode; /* short edge combination histogram */ |
---|
232 | struct SfxNode **pp_LongEdges; /* sorted array of tree pointers to long edges */ |
---|
233 | |
---|
234 | STRPTR pp_LongDict; /* Dictionary for long edges */ |
---|
235 | ULONG pp_LongDictSize; /* allocation size */ |
---|
236 | UWORD pp_LongRelPtrBits; /* bits to use to reference a long edge offset */ |
---|
237 | |
---|
238 | struct HashArray *pp_LongDictHash; /* lookup hash array to speed up search */ |
---|
239 | ULONG pp_LongDictHashSize; /* size of hash */ |
---|
240 | |
---|
241 | ULONG pp_LongEdgeLenSize; /* size of the huffman array (longest length + 1) */ |
---|
242 | struct HuffCode *pp_LongEdgeLenCode; /* long edges length histogram */ |
---|
243 | |
---|
244 | ULONG *pp_LongDictRaw; /* compressed long dictionary */ |
---|
245 | ULONG pp_LongDictRawSize; /* compressed long dictionary size on disk */ |
---|
246 | |
---|
247 | /* leaf collection */ |
---|
248 | LONG *pp_LeafBuffer; /* temporary leaf buffer */ |
---|
249 | ULONG pp_LeafBufferSize; /* size of leaf buffer */ |
---|
250 | LONG *pp_LeafBufferPtr; /* current pointer position */ |
---|
251 | |
---|
252 | /* file saving */ |
---|
253 | STRPTR pp_PartitionName; /* file name of partition */ |
---|
254 | FILE *pp_PartitionFile; /* file handle for partition */ |
---|
255 | ULONG pp_TraverseTreeRoot; /* start of in order traversal (sn_Parent) */ |
---|
256 | ULONG pp_DiskTreeSize; /* backward position on disk (growing size) */ |
---|
257 | ULONG pp_DiskNodeCount; /* number of inner nodes written */ |
---|
258 | ULONG pp_DiskNodeSpace; /* memory used for nodes */ |
---|
259 | ULONG pp_DiskLeafCount; /* number of inner leaf nodes */ |
---|
260 | ULONG pp_DiskLeafSpace; /* memory used for leaf nodes */ |
---|
261 | ULONG pp_DiskOuterLeaves; /* number of leaf positions saved in arrays */ |
---|
262 | ULONG pp_DiskIdxSpace; /* size of file on disk */ |
---|
263 | UBYTE *pp_DiskBuffer; /* Buffer for writing tree */ |
---|
264 | ULONG pp_DiskBufferSize; /* Size of disk buffer */ |
---|
265 | ULONG pp_DiskPos; /* Fill state of buffer */ |
---|
266 | UBYTE *pp_DiskTree; /* start of compressed tree */ |
---|
267 | UBYTE *pp_MapFileBuffer; /* buffer for mapped file */ |
---|
268 | ULONG pp_MapFileSize; /* size of virtual memory */ |
---|
269 | UBYTE *pp_StdSfxMapBuffer; /* buffer for mapped source text (StdSfx) */ |
---|
270 | |
---|
271 | /* node decrunching */ |
---|
272 | struct HuffTree *pp_BranchTree; /* branch combination tree */ |
---|
273 | struct HuffTree *pp_ShortEdgeTree; /* short edge combination tree */ |
---|
274 | struct HuffTree *pp_LongEdgeLenTree; /* long edges length tree */ |
---|
275 | |
---|
276 | struct CacheNode *pp_CacheNode; /* caching information node */ |
---|
277 | }; |
---|
278 | |
---|
279 | #define RELOFFSETBITS 28 |
---|
280 | #define RELOFFSETMASK ((1UL << RELOFFSETBITS) - 1) |
---|
281 | #define LEAFBIT 31 |
---|
282 | #define LEAFMASK 0x80000000 |
---|
283 | |
---|
284 | /* suffix tree stub node for long edge extra intervals */ |
---|
285 | struct SfxNodeStub |
---|
286 | { |
---|
287 | ULONG sn_StartPos; /* start position of edge (or prefix or dictionary position) */ |
---|
288 | UWORD sn_EdgeLen; /* length of edge */ |
---|
289 | UWORD sn_Dummy; /* alpha mask */ |
---|
290 | }; |
---|
291 | |
---|
292 | /* suffix tree inner node for two edges */ |
---|
293 | struct SfxNode2Edges |
---|
294 | { |
---|
295 | ULONG sn_StartPos; /* start position */ |
---|
296 | UWORD sn_EdgeLen; /* length of edge */ |
---|
297 | UWORD sn_AlphaMask; /* alpha mask */ |
---|
298 | ULONG sn_Parent; /* pointer to parent offset (upper 4 bits for edge count) */ |
---|
299 | ULONG sn_Children[2]; /* array of children (lower 28 bits for array offset, |
---|
300 | upper 4 bits for edge id and terminal bit */ |
---|
301 | }; |
---|
302 | |
---|
303 | /* suffix tree inner node for five edges */ |
---|
304 | struct SfxNodeNEdges |
---|
305 | { |
---|
306 | ULONG sn_StartPos; /* start position */ |
---|
307 | UWORD sn_EdgeLen; /* length of edge */ |
---|
308 | UWORD sn_AlphaMask; /* alpha mask */ |
---|
309 | ULONG sn_Parent; /* pointer to parent offset (upper 4 bits for edge count) */ |
---|
310 | ULONG sn_Children[ALPHASIZE]; /* array of children (lower 28 bits for array |
---|
311 | offset ,upper 4 bits for edge id and terminal bit */ |
---|
312 | }; |
---|
313 | |
---|
314 | /* suffix tree node template */ |
---|
315 | struct SfxNode |
---|
316 | { |
---|
317 | ULONG sn_StartPos; /* start position of edge (or prefix or dictionary position) */ |
---|
318 | UWORD sn_EdgeLen; /* length of edge */ |
---|
319 | UWORD sn_AlphaMask; /* alpha mask */ |
---|
320 | ULONG sn_Parent; /* pointer to parent offset (upper 4 bits for edge count) */ |
---|
321 | ULONG sn_Children[1]; /* array of children */ |
---|
322 | }; |
---|
323 | |
---|
324 | /* alternate implementation: standard suffix tree */ |
---|
325 | struct StdSfxNode |
---|
326 | { |
---|
327 | ULONG ssn_StartPos; /* start position of edge */ |
---|
328 | ULONG ssn_EdgeLen; /* length of edge */ |
---|
329 | ULONG ssn_FirstChild; /* first child offset */ |
---|
330 | ULONG ssn_NextSibling; /* next sibling offset */ |
---|
331 | ULONG ssn_Parent; /* parent node offset */ |
---|
332 | ULONG ssn_Prime; /* link node offset */ |
---|
333 | }; |
---|
334 | |
---|
335 | /* StdSfxNode truncated */ |
---|
336 | struct StdSfxNodeOnDisk |
---|
337 | { |
---|
338 | ULONG ssn_StartPos; /* start position of edge */ |
---|
339 | ULONG ssn_EdgeLen; /* length of edge */ |
---|
340 | ULONG ssn_FirstChild; /* first child offset */ |
---|
341 | ULONG ssn_NextSibling; /* next sibling offset */ |
---|
342 | }; |
---|
343 | |
---|
344 | /* tree statistics */ |
---|
345 | struct TreeLevelStats |
---|
346 | { |
---|
347 | ULONG tls_NodeCount; /* number of nodes on this level */ |
---|
348 | ULONG tls_LeafCount; /* number of leaves on this level */ |
---|
349 | ULONG tls_TotalNodeCount; /* number of nodes on this level and below */ |
---|
350 | ULONG tls_TotalLeafCount; /* number of leaves on this level and below */ |
---|
351 | ULONG tls_LevelSize; /* number of bytes used on for this level */ |
---|
352 | ULONG tls_Offset; /* current position < tls_LevelSize */ |
---|
353 | }; |
---|
354 | |
---|
355 | /* decrunched tree node data structure */ |
---|
356 | struct TreeNode |
---|
357 | { |
---|
358 | struct PTPanPartition *tn_PTPanPartition; /* uplinking */ |
---|
359 | ULONG tn_Pos; /* absolute tree position */ |
---|
360 | ULONG tn_Size; /* size of the uncompressed node in bytes */ |
---|
361 | UWORD tn_Level; /* level of the tree node */ |
---|
362 | |
---|
363 | UWORD tn_ParentSeq; /* branch that lead there */ |
---|
364 | ULONG tn_ParentPos; /* position of parent node */ |
---|
365 | struct TreeNode *tn_Parent; /* parent node */ |
---|
366 | UWORD tn_TreeOffset; /* sum of the parent edges (i.e. relative string position) */ |
---|
367 | UWORD tn_EdgeLen; /* length of edge */ |
---|
368 | STRPTR tn_Edge; /* decompressed edge */ |
---|
369 | |
---|
370 | ULONG tn_Children[ALPHASIZE]; /* absolute position of the children */ |
---|
371 | ULONG tn_NumLeaves; /* number of leaves in this node */ |
---|
372 | ULONG tn_Leaves[1]; /* variable size of leaf array */ |
---|
373 | }; |
---|
374 | |
---|
375 | /* values for sq_SortMode */ |
---|
376 | #define SORT_HITS_NOWEIGHT 0 /* use mismatch count */ |
---|
377 | #define SORT_HITS_WEIGHTED 1 /* use weighted error */ |
---|
378 | |
---|
379 | /* automata state for search algorithm */ |
---|
380 | struct SearchQueryState |
---|
381 | { |
---|
382 | struct TreeNode *sqs_TreeNode; /* tree node */ |
---|
383 | ULONG sqs_SourcePos; /* current position within source string (linear only) */ |
---|
384 | ULONG sqs_QueryPos; /* current position within query */ |
---|
385 | UWORD sqs_ReplaceCount; /* number of replace operations so far */ |
---|
386 | UWORD sqs_InsertCount; /* number of insert operations so far */ |
---|
387 | UWORD sqs_DeleteCount; /* number of delete operations so far */ |
---|
388 | UWORD sqs_NCount; /* number of N-sequence codes encountered */ |
---|
389 | float sqs_ErrorCount; /* errors currently found */ |
---|
390 | }; |
---|
391 | |
---|
392 | /* parameter block and variables for a query */ |
---|
393 | struct SearchQuery |
---|
394 | { |
---|
395 | struct Node sq_Node; /* linkage */ |
---|
396 | struct PTPanGlobal *sq_PTPanGlobal; /* up link */ |
---|
397 | struct PTPanPartition *sq_PTPanPartition; /* up link */ |
---|
398 | |
---|
399 | /* prefs/settings */ |
---|
400 | STRPTR sq_SourceSeq; /* for linear comparision ONLY */ |
---|
401 | STRPTR sq_Query; /* query string */ |
---|
402 | ULONG sq_QueryLen; /* length of the query string */ |
---|
403 | BOOL sq_Reversed; /* query is reversed */ |
---|
404 | BOOL sq_AllowReplace; /* allow replace operations */ |
---|
405 | BOOL sq_AllowInsert; /* allow inserting operations */ |
---|
406 | BOOL sq_AllowDelete; /* allow delete operations */ |
---|
407 | ULONG sq_KillNSeqsAt; /* if string has too many Ns, kill it */ |
---|
408 | float sq_MinorMisThres; /* threshold for small letter mismatch output */ |
---|
409 | struct MismatchWeights *sq_MismatchWeights; /* pointer to matrix for mismatches */ |
---|
410 | float sq_MaxErrors; /* maximum allowed errors */ |
---|
411 | double* sq_PosWeight; // mismatch multiplier (Gaussian like distribution over search string) |
---|
412 | |
---|
413 | /* results */ |
---|
414 | struct HashArray *sq_HitsHash; /* hash array to avoid double entries */ |
---|
415 | ULONG sq_HitsHashSize; /* size of hash array */ |
---|
416 | struct List sq_Hits; /* list of results */ |
---|
417 | ULONG sq_NumHits; /* number of hits stored in array */ |
---|
418 | UWORD sq_SortMode; /* describes the ordering for sorting */ |
---|
419 | |
---|
420 | struct SearchQueryState sq_State; /* running variables */ |
---|
421 | }; |
---|
422 | |
---|
423 | #define QHF_UNSAFE 0x0001 /* this hit must be verified */ |
---|
424 | #define QHF_REVERSED 0x0002 /* this hit was found in the reversed sequence */ |
---|
425 | #define QHF_ISVALID 0x8000 /* it's a valid hit */ |
---|
426 | |
---|
427 | /* data stored for a candidate */ |
---|
428 | struct QueryHit |
---|
429 | { |
---|
430 | struct Node qh_Node; /* linkage */ |
---|
431 | ULLONG qh_AbsPos; /* absolute position where hit was found */ |
---|
432 | UWORD qh_Flags; /* is this still valid */ |
---|
433 | UWORD qh_ReplaceCount; /* Amount of replace operations required */ |
---|
434 | UWORD qh_InsertCount; /* Amount of insert operations required */ |
---|
435 | UWORD qh_DeleteCount; /* Amount of delete operations required */ |
---|
436 | float qh_ErrorCount; /* errors that were needed to reach that hit */ |
---|
437 | struct PTPanSpecies *qh_Species; /* pointer to species this hit is in */ |
---|
438 | }; |
---|
439 | |
---|
440 | /* data tuple for collected hits */ |
---|
441 | struct HitTuple |
---|
442 | { |
---|
443 | ULLONG ht_AbsPos; /* absolute position of hit */ |
---|
444 | struct PTPanSpecies *ht_Species; /* link to species */ |
---|
445 | }; |
---|
446 | |
---|
447 | /* probe candidate */ |
---|
448 | struct DesignHit |
---|
449 | { |
---|
450 | struct Node dh_Node; /* linkage */ |
---|
451 | STRPTR dh_ProbeSeq; /* sequence of the probe */ |
---|
452 | ULONG dh_GroupHits; /* number of species covered by the hit */ |
---|
453 | ULONG dh_NonGroupHits; /* number of non group hits inside this hit */ |
---|
454 | ULONG dh_Hairpin; /* number of hairpin bonds */ |
---|
455 | double dh_Temp; /* temperature */ |
---|
456 | UWORD dh_GCContent; /* number of G and C nucleotides in the hit */ |
---|
457 | ULONG dh_NumMatches; /* number of hits in this puddle */ |
---|
458 | struct HitTuple *dh_Matches; /* Absolute pos of all hits */ |
---|
459 | struct SearchQuery *dh_SearchQuery; /* search query for probe quality calculation */ |
---|
460 | ULONG dh_NonGroupHitsPerc[PERC_SIZE]; /* number of nongroup hits with decreasing temperature */ |
---|
461 | }; |
---|
462 | |
---|
463 | /* data structure for probe design */ |
---|
464 | struct DesignQuery |
---|
465 | { |
---|
466 | struct Node dq_Node; /* linkage */ |
---|
467 | struct PTPanGlobal *dq_PTPanGlobal; /* up link */ |
---|
468 | struct PTPanPartition *dq_PTPanPartition; /* up link */ |
---|
469 | ULONG dq_Serial; /* running number for duplicate detection */ |
---|
470 | UWORD dq_ProbeLength; /* length of the probe */ |
---|
471 | ULONG dq_MarkedSpecies; /* number of marked species */ |
---|
472 | ULONG dq_MinGroupHits; /* minimum species covered by the hit */ |
---|
473 | ULONG dq_MaxNonGroupHits; /* number of non group hits allowed */ |
---|
474 | ULONG dq_MaxHairpin; /* maximum number of hairpin bonds */ |
---|
475 | double dq_MinTemp; /* minimum temperature */ |
---|
476 | double dq_MaxTemp; /* maximum temperature */ |
---|
477 | UWORD dq_MinGC; /* minimum number of G and C nucleotides in the hit */ |
---|
478 | UWORD dq_MaxGC; /* maximum number of G and C nucleotides in the hit */ |
---|
479 | ULONG dq_MaxHits; /* maximum number of matches */ |
---|
480 | ULONG dq_NumHits; /* number of matches */ |
---|
481 | struct List dq_Hits; /* list of matches */ |
---|
482 | |
---|
483 | struct DesignHit dq_TempHit; /* temporary memory to avoid too many alloc/free's */ |
---|
484 | ULONG dq_TempMemSize; /* memory allocated */ |
---|
485 | |
---|
486 | struct TreeNode *dq_TreeNode; /* current tree node traversed */ |
---|
487 | |
---|
488 | PT_pdc *dq_PDC; /* legacy link */ |
---|
489 | |
---|
490 | }; |
---|
491 | |
---|
492 | /* cache handler information node (private) */ |
---|
493 | struct CacheHandler |
---|
494 | { |
---|
495 | struct Node ch_Node; /* linkage */ |
---|
496 | struct List ch_UsedNodes; /* list of loaded nodes */ |
---|
497 | struct List ch_FreeNodes; /* list of free nodes */ |
---|
498 | APTR ch_UserData; /* public: user data pointer */ |
---|
499 | ULONG ch_SwapCount; /* swapping count */ |
---|
500 | |
---|
501 | /* private stuff */ |
---|
502 | BOOL ch_CacheDisabled; /* do not cache new stuff */ |
---|
503 | ULONG ch_AccessID; /* running sequence ID */ |
---|
504 | ULONG ch_MemUsage; /* memory used for this cache list */ |
---|
505 | ULONG ch_MaxCapacity; /* how much to hold in this cache maximum */ |
---|
506 | |
---|
507 | BOOL (*ch_LoadFunc)(struct CacheHandler *, APTR); /* callback hook to load data */ |
---|
508 | |
---|
509 | BOOL (*ch_UnloadFunc)(struct CacheHandler *, APTR); /* callback hook to unload data */ |
---|
510 | ULONG (*ch_SizeFunc)(struct CacheHandler *, APTR); /* callback hook to determine size of data */ |
---|
511 | |
---|
512 | struct CacheNode *ch_LastNode; /* for unloading (in no caching mode) */ |
---|
513 | }; |
---|
514 | |
---|
515 | /* node containing statistical and management data for caching */ |
---|
516 | struct CacheNode |
---|
517 | { |
---|
518 | struct Node cn_Node; /* linking node */ |
---|
519 | APTR cn_UserData; /* link to the user node this is attached to */ |
---|
520 | ULONG cn_LastUseID; /* access sequence ID of last use */ |
---|
521 | BOOL cn_Loaded; /* is loaded */ |
---|
522 | }; |
---|
523 | |
---|
524 | /* huffman tree node */ |
---|
525 | struct HuffTree |
---|
526 | { |
---|
527 | struct HuffTree *ht_Child[2]; /* children */ |
---|
528 | ULONG ht_ID; /* ID to return */ |
---|
529 | ULONG ht_Codec; /* complete codec */ |
---|
530 | UWORD ht_CodeLength; /* code length */ |
---|
531 | }; |
---|
532 | |
---|
533 | /* huffman table entry */ |
---|
534 | struct HuffCode |
---|
535 | { |
---|
536 | ULONG hc_Weight; /* weight of item */ |
---|
537 | ULONG hc_Codec; /* actual bits (in lsb) */ |
---|
538 | UWORD hc_CodeLength; /* lengths in bits */ |
---|
539 | }; |
---|
540 | |
---|
541 | /* temporary internal node type */ |
---|
542 | struct HuffCodeInternal |
---|
543 | { |
---|
544 | ULONG hc_ID; /* code id */ |
---|
545 | ULONG hc_Weight; /* weight of item */ |
---|
546 | UWORD hc_Left; /* ID of left parent */ |
---|
547 | UWORD hc_Right; /* ID of right parent */ |
---|
548 | }; |
---|
549 | |
---|
550 | /* dynamic hashing struct */ |
---|
551 | struct HashArray |
---|
552 | { |
---|
553 | struct HashEntry *ha_Array; /* Array of entries */ |
---|
554 | ULONG ha_Size; /* Size of array */ |
---|
555 | ULONG ha_Used; /* Number of entries used */ |
---|
556 | ULONG ha_InitialSize; /* Initial size */ |
---|
557 | }; |
---|
558 | |
---|
559 | /* static hashing hash entry */ |
---|
560 | struct HashEntry |
---|
561 | { |
---|
562 | ULONG he_Key; /* Hash Key */ |
---|
563 | ULONG he_Data; /* Auxilliary data */ |
---|
564 | }; |
---|