| 1 | #include <stdio.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | #include <malloc.h> |
|---|
| 4 | #include <string.h> |
|---|
| 5 | #include <arbdb.h> |
|---|
| 6 | #include <arbdbt.h> |
|---|
| 7 | |
|---|
| 8 | #define ADD_LEN 10 |
|---|
| 9 | #define PRM_HASH_SIZE 1024 |
|---|
| 10 | #define PRM_HASH2_SIZE 512 |
|---|
| 11 | struct arb_prm_struct { |
|---|
| 12 | char **alignment_names; |
|---|
| 13 | int al_len; |
|---|
| 14 | int max_name; |
|---|
| 15 | GBDATA *gb_main; |
|---|
| 16 | char buffer[256]; |
|---|
| 17 | char *source; |
|---|
| 18 | int prmanz; |
|---|
| 19 | int prmlen; |
|---|
| 20 | int prmsmin; |
|---|
| 21 | GB_HASH *hash; |
|---|
| 22 | char **data; |
|---|
| 23 | int sp_count; |
|---|
| 24 | int key_cnt; |
|---|
| 25 | int one_key_cnt; |
|---|
| 26 | int reduce; |
|---|
| 27 | FILE *out; |
|---|
| 28 | char *outname; |
|---|
| 29 | } aprm; |
|---|
| 30 | |
|---|
| 31 | |
|---|
| 32 | void arb_prm_menu() |
|---|
| 33 | { |
|---|
| 34 | char **alignment_name; |
|---|
| 35 | int i; |
|---|
| 36 | printf(" Please select an Alignment:\n"); |
|---|
| 37 | for ( alignment_name = aprm.alignment_names,i=1; |
|---|
| 38 | *alignment_name; |
|---|
| 39 | alignment_name++,i++){ |
|---|
| 40 | printf("%i: %s\n",i,*alignment_name); |
|---|
| 41 | } |
|---|
| 42 | aprm.max_name = i; |
|---|
| 43 | gets(aprm.buffer); |
|---|
| 44 | i = atoi(aprm.buffer); |
|---|
| 45 | if ((i<1) || (i>=aprm.max_name)) { |
|---|
| 46 | printf ("ERROR: select %i out of range\n",i); |
|---|
| 47 | exit(-1); |
|---|
| 48 | } |
|---|
| 49 | aprm.source = aprm.alignment_names[i-1]; |
|---|
| 50 | printf( "This module will search for primers for all positions.\n" |
|---|
| 51 | " The best result is one primer for all (marked) taxa , the worst case\n" |
|---|
| 52 | " are n primers for n taxa.\n" |
|---|
| 53 | " Please specify the maximum number of primers:\n" |
|---|
| 54 | ); |
|---|
| 55 | gets(aprm.buffer); |
|---|
| 56 | i = atoi(aprm.buffer); |
|---|
| 57 | aprm.prmanz = i; |
|---|
| 58 | printf( "Select minimum length of a primer, the maximum will be (mimimum + %i)\n", |
|---|
| 59 | ADD_LEN); |
|---|
| 60 | gets(aprm.buffer); |
|---|
| 61 | i = atoi(aprm.buffer); |
|---|
| 62 | if ((i<4) || (i>30)) { |
|---|
| 63 | printf ("ERROR: select %i out of range\n",i); |
|---|
| 64 | exit(-1); |
|---|
| 65 | } |
|---|
| 66 | aprm.prmlen = i; |
|---|
| 67 | |
|---|
| 68 | printf( "There may be short sequences or/and deletes in full sequences\n" |
|---|
| 69 | " So a primer normally does not match all sequences\n" |
|---|
| 70 | " Specify minimum percentage of species (0-100 %%):\n"); |
|---|
| 71 | gets(aprm.buffer); |
|---|
| 72 | i = atoi(aprm.buffer); |
|---|
| 73 | if ((i<1) || (i>100)) { |
|---|
| 74 | printf ("ERROR: select %i out of range\n",i); |
|---|
| 75 | exit(-1); |
|---|
| 76 | } |
|---|
| 77 | aprm.prmsmin = i; |
|---|
| 78 | |
|---|
| 79 | printf( "Write output to file (enter \"\" to write to screen)\n"); |
|---|
| 80 | gets(aprm.buffer); |
|---|
| 81 | aprm.outname = strdup(aprm.buffer); |
|---|
| 82 | } |
|---|
| 83 | |
|---|
| 84 | char *arb_prm_read(int /*prmanz*/) |
|---|
| 85 | { |
|---|
| 86 | GBDATA *gb_presets; |
|---|
| 87 | GBDATA *gb_source; |
|---|
| 88 | GBDATA *gb_species; |
|---|
| 89 | GBDATA *gb_source_data; |
|---|
| 90 | GBDATA *gb_len; |
|---|
| 91 | int sp_count; |
|---|
| 92 | // char *flag; |
|---|
| 93 | char *data; |
|---|
| 94 | char *hdata; |
|---|
| 95 | // int size; |
|---|
| 96 | |
|---|
| 97 | gb_presets = GB_find(aprm.gb_main,"presets",0,down_level); |
|---|
| 98 | |
|---|
| 99 | gb_source = GB_find(gb_presets,"alignment_name",aprm.source,down_2_level); |
|---|
| 100 | gb_len = GB_find(gb_source,"alignment_len",0,this_level); |
|---|
| 101 | aprm.al_len = GB_read_int(gb_len); |
|---|
| 102 | |
|---|
| 103 | |
|---|
| 104 | sp_count = GBT_count_marked_species(aprm.gb_main); |
|---|
| 105 | |
|---|
| 106 | aprm.data = (char **)calloc(sp_count,sizeof(char *)); |
|---|
| 107 | sp_count = 0; |
|---|
| 108 | for ( gb_species = GBT_first_marked_species(aprm.gb_main); |
|---|
| 109 | gb_species; |
|---|
| 110 | gb_species = GBT_next_marked_species(gb_species) ){ |
|---|
| 111 | |
|---|
| 112 | gb_source = GB_find(gb_species,aprm.source,0,down_level); |
|---|
| 113 | if (!gb_source) continue; |
|---|
| 114 | gb_source_data = GB_find(gb_source,"data",0,down_level); |
|---|
| 115 | if (!gb_source_data) continue; |
|---|
| 116 | data = (char *)calloc(sizeof(char),aprm.al_len+1); |
|---|
| 117 | hdata = GB_read_char_pntr(gb_source_data); |
|---|
| 118 | if (!hdata) { |
|---|
| 119 | GB_print_error(); |
|---|
| 120 | continue; |
|---|
| 121 | } |
|---|
| 122 | aprm.data[sp_count ++] = data; |
|---|
| 123 | if (sp_count %50 == 0) printf("Reading taxa %i\n",sp_count); |
|---|
| 124 | { |
|---|
| 125 | int i,size;char c; |
|---|
| 126 | size = GB_read_string_count(gb_source_data); |
|---|
| 127 | for (i=0;i<size;i++) { |
|---|
| 128 | c = hdata[i]; |
|---|
| 129 | if ( (c>='a') && (c<='z')) |
|---|
| 130 | data[i] = c-'a'+'A'; |
|---|
| 131 | else |
|---|
| 132 | data[i] = c; |
|---|
| 133 | } |
|---|
| 134 | for (i=i ; i < aprm.al_len; i++) data[i] = '.'; |
|---|
| 135 | } |
|---|
| 136 | } |
|---|
| 137 | printf("%i taxa read\n",sp_count); |
|---|
| 138 | aprm.sp_count = sp_count; |
|---|
| 139 | if (sp_count ==0) { |
|---|
| 140 | exit(0); |
|---|
| 141 | } |
|---|
| 142 | return 0; |
|---|
| 143 | } |
|---|
| 144 | |
|---|
| 145 | long arb_count_keys(const char */*key*/,long val) |
|---|
| 146 | { |
|---|
| 147 | if (val >1) { |
|---|
| 148 | aprm.key_cnt++; |
|---|
| 149 | }else{ |
|---|
| 150 | aprm.one_key_cnt++; |
|---|
| 151 | } |
|---|
| 152 | return val; |
|---|
| 153 | } |
|---|
| 154 | |
|---|
| 155 | long arb_print_primer(const char *key,long val) |
|---|
| 156 | { |
|---|
| 157 | if (val <=1) return val; |
|---|
| 158 | int gc = 0; |
|---|
| 159 | const char *p; |
|---|
| 160 | for (p = key; *p; p++) { |
|---|
| 161 | if (*p == 'G' || *p== 'C') gc++; |
|---|
| 162 | } |
|---|
| 163 | fprintf(aprm.out," %s matching %4li taxa GC = %3i%%\n", |
|---|
| 164 | key,val,100*gc/(int)strlen(key)); |
|---|
| 165 | return val; |
|---|
| 166 | } |
|---|
| 167 | |
|---|
| 168 | #define is_base(c) ( ((c>='a') && (c<='z')) || ( (c>='A')&&(c<='Z') ) ) |
|---|
| 169 | |
|---|
| 170 | int primer_print(char *dest,char * source,int size) |
|---|
| 171 | { |
|---|
| 172 | char c; |
|---|
| 173 | c = *(source++); |
|---|
| 174 | if (!is_base(c)) return 1; |
|---|
| 175 | while (size){ |
|---|
| 176 | while (!is_base(c)) c=*(source++); |
|---|
| 177 | if ( c == 'N' || c == 'n' ) return 1; |
|---|
| 178 | *(dest++) = c; |
|---|
| 179 | size--; |
|---|
| 180 | if (!c) return 1; |
|---|
| 181 | c = 0; |
|---|
| 182 | } |
|---|
| 183 | *dest = 0; |
|---|
| 184 | return 0; |
|---|
| 185 | } |
|---|
| 186 | |
|---|
| 187 | long arb_reduce_primer_len(const char *key,long val) |
|---|
| 188 | { |
|---|
| 189 | static char buffer[256]; |
|---|
| 190 | int size = strlen(key)-aprm.reduce; |
|---|
| 191 | strncpy(buffer,key,size); |
|---|
| 192 | buffer[size] = 0; |
|---|
| 193 | val += GBS_read_hash(aprm.hash,buffer); |
|---|
| 194 | GBS_write_hash(aprm.hash,buffer,val); |
|---|
| 195 | return val; |
|---|
| 196 | } |
|---|
| 197 | |
|---|
| 198 | void arb_prm_primer(int /*prmanz*/) |
|---|
| 199 | { |
|---|
| 200 | GB_HASH *hash; |
|---|
| 201 | GB_HASH *mhash; |
|---|
| 202 | int sp; |
|---|
| 203 | // int len; |
|---|
| 204 | char *buffer; |
|---|
| 205 | int pos; |
|---|
| 206 | int prmlen; |
|---|
| 207 | int pspecies; |
|---|
| 208 | |
|---|
| 209 | int cutoff_cnt; |
|---|
| 210 | |
|---|
| 211 | int *best_primer_cnt; |
|---|
| 212 | int *best_primer_new; |
|---|
| 213 | int *best_primer_swap; |
|---|
| 214 | // int newlen; |
|---|
| 215 | |
|---|
| 216 | prmlen = aprm.prmlen + ADD_LEN; |
|---|
| 217 | |
|---|
| 218 | buffer = (char *) calloc(sizeof(char), prmlen + 1); |
|---|
| 219 | best_primer_cnt = (int *)calloc(prmlen+1,sizeof(int)); |
|---|
| 220 | best_primer_new = (int *)calloc(prmlen+1,sizeof(int)); |
|---|
| 221 | |
|---|
| 222 | for (pos = 0; pos < aprm.al_len; pos++) { |
|---|
| 223 | prmlen = aprm.prmlen + ADD_LEN; |
|---|
| 224 | mhash = GBS_create_hash(PRM_HASH_SIZE,0); |
|---|
| 225 | pspecies = 0; |
|---|
| 226 | if (pos % 50 == 0) printf("Pos. %i (%i)\n",pos,aprm.al_len); |
|---|
| 227 | cutoff_cnt = aprm.prmanz+1; |
|---|
| 228 | for (sp = 0; sp < aprm.sp_count; sp++) { /* build initial hash table */ |
|---|
| 229 | if (!primer_print(buffer, aprm.data[sp] + pos, prmlen)) { |
|---|
| 230 | GBS_incr_hash(mhash, buffer); |
|---|
| 231 | pspecies++; |
|---|
| 232 | } |
|---|
| 233 | } |
|---|
| 234 | if (pspecies*100 >= aprm.prmsmin * aprm.sp_count ) { /* reduce primer length */ |
|---|
| 235 | for (hash = mhash; prmlen >= aprm.prmlen; prmlen-=aprm.reduce) { |
|---|
| 236 | hash = GBS_create_hash(aprm.prmanz*2,0); |
|---|
| 237 | aprm.hash = hash; |
|---|
| 238 | |
|---|
| 239 | aprm.key_cnt = 0; |
|---|
| 240 | aprm.one_key_cnt = 0; |
|---|
| 241 | GBS_hash_do_loop(mhash, arb_count_keys); |
|---|
| 242 | if ((aprm.key_cnt + aprm.one_key_cnt < cutoff_cnt) && |
|---|
| 243 | // (aprm.key_cnt > aprm.one_key_cnt) && |
|---|
| 244 | (aprm.key_cnt<best_primer_cnt[prmlen+1])){ |
|---|
| 245 | fprintf(aprm.out,"%3i primer found len %3i(of %4i taxa) for position %i\n", aprm.key_cnt, prmlen, pspecies, pos); |
|---|
| 246 | GBS_hash_do_loop(mhash, arb_print_primer); |
|---|
| 247 | fprintf(aprm.out,"\n\n"); |
|---|
| 248 | cutoff_cnt = aprm.key_cnt; |
|---|
| 249 | } |
|---|
| 250 | best_primer_new[prmlen] = aprm.key_cnt; |
|---|
| 251 | aprm.reduce = 1; |
|---|
| 252 | while (aprm.key_cnt > aprm.prmanz*4){ |
|---|
| 253 | aprm.key_cnt/=4; |
|---|
| 254 | aprm.reduce++; |
|---|
| 255 | } |
|---|
| 256 | GBS_hash_do_loop(mhash,arb_reduce_primer_len); |
|---|
| 257 | GBS_free_hash(mhash); |
|---|
| 258 | mhash = hash; |
|---|
| 259 | } |
|---|
| 260 | }else{ |
|---|
| 261 | for (;prmlen>0;prmlen--) best_primer_new[prmlen] = aprm.prmanz+1; |
|---|
| 262 | } |
|---|
| 263 | GBS_free_hash(mhash); |
|---|
| 264 | best_primer_swap = best_primer_new; |
|---|
| 265 | best_primer_new = best_primer_cnt; |
|---|
| 266 | best_primer_cnt = best_primer_swap; |
|---|
| 267 | mhash = 0; |
|---|
| 268 | } |
|---|
| 269 | |
|---|
| 270 | } |
|---|
| 271 | |
|---|
| 272 | int main(int argc, char **/*argv*/) |
|---|
| 273 | { |
|---|
| 274 | char *error; |
|---|
| 275 | const char *path; |
|---|
| 276 | if (argc != 1) { |
|---|
| 277 | fprintf(stderr,"no parameters\n"); |
|---|
| 278 | printf(" Converts RNA or DNA Data to Pro\n"); |
|---|
| 279 | exit(-1); |
|---|
| 280 | } |
|---|
| 281 | path = ":"; |
|---|
| 282 | aprm.gb_main = GB_open(path,"r"); |
|---|
| 283 | if (!aprm.gb_main){ |
|---|
| 284 | fprintf(stderr,"ERROR cannot find server\n"); |
|---|
| 285 | exit(-1); |
|---|
| 286 | } |
|---|
| 287 | GB_begin_transaction(aprm.gb_main); |
|---|
| 288 | aprm.alignment_names = GBT_get_alignment_names(aprm.gb_main); |
|---|
| 289 | GB_commit_transaction(aprm.gb_main); |
|---|
| 290 | arb_prm_menu(); |
|---|
| 291 | GB_begin_transaction(aprm.gb_main); |
|---|
| 292 | error = arb_prm_read(aprm.prmanz); |
|---|
| 293 | if (error) { |
|---|
| 294 | printf("ERROR: %s\n",error); |
|---|
| 295 | exit(0); |
|---|
| 296 | } |
|---|
| 297 | GB_commit_transaction(aprm.gb_main); |
|---|
| 298 | if (strlen(aprm.outname)) { |
|---|
| 299 | aprm.out = fopen(aprm.outname,"w"); |
|---|
| 300 | if (!aprm.out) { |
|---|
| 301 | printf("Cannot open outfile\n"); |
|---|
| 302 | exit (-1); |
|---|
| 303 | } |
|---|
| 304 | arb_prm_primer(aprm.prmanz); |
|---|
| 305 | fclose(aprm.out); |
|---|
| 306 | }else{ |
|---|
| 307 | aprm.out = stdout; |
|---|
| 308 | arb_prm_primer(aprm.prmanz); |
|---|
| 309 | } |
|---|
| 310 | return 0; |
|---|
| 311 | } |
|---|