1 | #include <stdio.h> |
---|
2 | #include <stdlib.h> |
---|
3 | // #include <malloc.h> |
---|
4 | #include <string.h> |
---|
5 | #include <ctype.h> |
---|
6 | #include <arbdb.h> |
---|
7 | #include <arbdbt.h> |
---|
8 | |
---|
9 | /* count different kind of nucleotides for each alignment position */ |
---|
10 | |
---|
11 | #define MAXLETTER ('Z'-'A'+1) |
---|
12 | #define RESULTNAME "COUNTED_CHARS" |
---|
13 | |
---|
14 | int main(int argc, char **argv){ |
---|
15 | const char *db_name = ":"; // default name -> link to server |
---|
16 | if (argc >1) db_name = argv[1]; // get the name of the database |
---|
17 | int is_amino = 0; |
---|
18 | |
---|
19 | printf("Counting the number of different chars of all marked sequences\n"); |
---|
20 | |
---|
21 | GBDATA *gb_main = GB_open(":","rw"); // open database |
---|
22 | if (!gb_main){ |
---|
23 | GB_print_error(); |
---|
24 | return -1; |
---|
25 | } |
---|
26 | |
---|
27 | GB_begin_transaction(gb_main); // open transaction |
---|
28 | |
---|
29 | char *alignment_name = GBT_get_default_alignment(gb_main); // info about sequences |
---|
30 | int alignment_len = GBT_get_alignment_len(gb_main,alignment_name); |
---|
31 | is_amino = GBT_is_alignment_protein(gb_main,alignment_name); |
---|
32 | char filter[256]; |
---|
33 | if (is_amino){ |
---|
34 | memset(filter,1,256); |
---|
35 | filter[(unsigned char)'B'] = 0; |
---|
36 | filter[(unsigned char)'J'] = 0; |
---|
37 | filter[(unsigned char)'O'] = 0; |
---|
38 | filter[(unsigned char)'U'] = 0; |
---|
39 | filter[(unsigned char)'Z'] = 0; |
---|
40 | }else{ |
---|
41 | memset(filter,0,256); |
---|
42 | filter[(unsigned char)'A'] = 1; |
---|
43 | filter[(unsigned char)'C'] = 1; |
---|
44 | filter[(unsigned char)'G'] = 1; |
---|
45 | filter[(unsigned char)'T'] = 1; |
---|
46 | filter[(unsigned char)'U'] = 1; |
---|
47 | } |
---|
48 | |
---|
49 | // malloc and clear arrays for counting characters (only letters) |
---|
50 | int *counters[MAXLETTER]; |
---|
51 | int i; |
---|
52 | for (i=0;i<MAXLETTER;i++){ |
---|
53 | counters[i] = (int *)calloc(sizeof(int),alignment_len); |
---|
54 | } |
---|
55 | |
---|
56 | |
---|
57 | // loop over all marked species |
---|
58 | int slider = 0; |
---|
59 | int all_marked = GBT_count_marked_species(gb_main); |
---|
60 | |
---|
61 | for (GBDATA *gb_species = GBT_first_marked_species(gb_main); |
---|
62 | gb_species; |
---|
63 | gb_species = GBT_next_marked_species(gb_species)){ |
---|
64 | if ((slider++)%10 == 0) printf("%i:%i\n",slider,all_marked); |
---|
65 | |
---|
66 | GBDATA *gb_ali = GB_entry(gb_species, alignment_name); // search the sequence database entry ( ali_xxx/data ) |
---|
67 | if (!gb_ali) continue; |
---|
68 | GBDATA *gb_data = GB_entry(gb_ali, "data"); |
---|
69 | if(!gb_data) continue; |
---|
70 | |
---|
71 | const char *seq = GB_read_char_pntr(gb_data); |
---|
72 | if (!seq) continue; |
---|
73 | |
---|
74 | unsigned char c; |
---|
75 | for ( i=0; (i< alignment_len) && (c = *(seq)); i++, seq++){ |
---|
76 | if (!isalpha(c)) continue; |
---|
77 | c = toupper(c); |
---|
78 | if (filter[c]){ |
---|
79 | counters[ c - 'A' ][i] ++; |
---|
80 | } |
---|
81 | } |
---|
82 | } |
---|
83 | |
---|
84 | char *result = (char *)calloc(sizeof(char), alignment_len + 1); |
---|
85 | for (i=0; i<alignment_len; i++) { |
---|
86 | int sum = 0; |
---|
87 | for (int l = 0; l < MAXLETTER; l++){ |
---|
88 | if (counters[l][i]>0) sum++; |
---|
89 | } |
---|
90 | result[i] = sum<10 ? '0'+sum : 'A'-10+sum; |
---|
91 | } |
---|
92 | // save result as SAI COUNTED_CHARS |
---|
93 | { |
---|
94 | GBDATA *gb_sai = GBT_find_or_create_SAI(gb_main,RESULTNAME); |
---|
95 | if (!gb_sai) { |
---|
96 | GB_print_error(); |
---|
97 | return -1; |
---|
98 | } |
---|
99 | GBDATA *gb_data = GBT_add_data(gb_sai, alignment_name, "data", GB_STRING); |
---|
100 | GB_write_string(gb_data, result); |
---|
101 | } |
---|
102 | |
---|
103 | GB_commit_transaction(gb_main); // commit it |
---|
104 | GB_close(gb_main); // politely disconnect from server |
---|
105 | return 0; |
---|
106 | |
---|
107 | } |
---|