| 1 | #include <stdio.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | #include <string.h> |
|---|
| 4 | #include <malloc.h> |
|---|
| 5 | #include <memory.h> |
|---|
| 6 | |
|---|
| 7 | #include <arbdb.h> |
|---|
| 8 | #include <arbdbt.h> |
|---|
| 9 | |
|---|
| 10 | #include "cat_tree.hxx" |
|---|
| 11 | |
|---|
| 12 | |
|---|
| 13 | const char *CAT_field_names[CAT_FIELD_MAX]; // Fields which are extracted from the arb database |
|---|
| 14 | // syntax: field[:word/(range of words] |
|---|
| 15 | // ex full_name |
|---|
| 16 | |
|---|
| 17 | double CATL_tree_max_deep(GBT_TREE *tree){ |
|---|
| 18 | if (tree->is_leaf) return 0.0; |
|---|
| 19 | double l,r; |
|---|
| 20 | l = CATL_tree_max_deep(tree->leftson); |
|---|
| 21 | r = CATL_tree_max_deep(tree->rightson); |
|---|
| 22 | if (tree->leftlen<0) tree->leftlen = 0; |
|---|
| 23 | if (tree->rightlen<0) tree->rightlen = 0; |
|---|
| 24 | l += tree->leftlen; |
|---|
| 25 | r += tree->rightlen; |
|---|
| 26 | if (l>r) return l; |
|---|
| 27 | return r; |
|---|
| 28 | } |
|---|
| 29 | |
|---|
| 30 | int CATL_size_of_tree(GBT_TREE *tree){ |
|---|
| 31 | if (tree->is_leaf) return 1; |
|---|
| 32 | return CATL_size_of_tree(tree->leftson) + |
|---|
| 33 | CATL_size_of_tree(tree->rightson) +1; |
|---|
| 34 | } |
|---|
| 35 | |
|---|
| 36 | |
|---|
| 37 | CAT_tree *new_CAT_tree(int size) |
|---|
| 38 | { |
|---|
| 39 | CAT_tree *THIS = (CAT_tree *)calloc(sizeof(CAT_tree) + sizeof(CAT_node) * (size-1),1); |
|---|
| 40 | |
|---|
| 41 | CAT_field_names[CAT_FIELD_NAME] = "name"; |
|---|
| 42 | CAT_field_names[CAT_FIELD_FULL_NAME] = "full_name"; |
|---|
| 43 | CAT_field_names[CAT_FIELD_GROUP_NAME] = "group_name"; |
|---|
| 44 | CAT_field_names[CAT_FIELD_ACC] = "acc"; |
|---|
| 45 | CAT_field_names[CAT_FIELD_STRAIN] = "strain"; |
|---|
| 46 | return THIS; |
|---|
| 47 | } |
|---|
| 48 | |
|---|
| 49 | |
|---|
| 50 | void *cat_mem_files[CAT_FIELD_MAX]; |
|---|
| 51 | |
|---|
| 52 | |
|---|
| 53 | GB_ERROR cat_write_cat_tree(CAT_tree *cat_tree, FILE *out){ |
|---|
| 54 | char *cat_strings[CAT_FIELD_MAX]; |
|---|
| 55 | long cat_offsets[CAT_FIELD_MAX]; |
|---|
| 56 | long cat_sizes[CAT_FIELD_MAX]; |
|---|
| 57 | int i,j,node; |
|---|
| 58 | for (i=0;i<CAT_FIELD_MAX;i++) cat_strings[i] = GBS_strclose(cat_mem_files[i],0); |
|---|
| 59 | long sizeoftree = sizeof(CAT_tree) + sizeof(CAT_node) * (cat_tree->nnodes-1); |
|---|
| 60 | long offset = sizeoftree; |
|---|
| 61 | for (i=0;i<CAT_FIELD_MAX;i++){ |
|---|
| 62 | int len = strlen(cat_strings[i])+1; |
|---|
| 63 | cat_sizes[i] = len; |
|---|
| 64 | // change '^A' seperated strings to \0 sep. strings |
|---|
| 65 | for (j=0;j<len;j++) if (cat_strings[i][j] == 1) cat_strings[i][j] = 0; |
|---|
| 66 | cat_offsets[i] = offset; |
|---|
| 67 | offset += len; |
|---|
| 68 | } |
|---|
| 69 | |
|---|
| 70 | for (node = 0; node < cat_tree->nnodes; node++) { |
|---|
| 71 | for (i=0;i<CAT_FIELD_MAX;i++) { |
|---|
| 72 | if ( cat_tree->nodes[node].field_offsets[i] == 0) continue; |
|---|
| 73 | // no field |
|---|
| 74 | cat_tree->nodes[node].field_offsets[i] += cat_offsets[i]; |
|---|
| 75 | } |
|---|
| 76 | } |
|---|
| 77 | |
|---|
| 78 | if ( !fwrite( (char *)cat_tree,(int)(sizeoftree), 1, out) ) { |
|---|
| 79 | return GB_export_error("Disk Full"); |
|---|
| 80 | } |
|---|
| 81 | |
|---|
| 82 | |
|---|
| 83 | for (i=0;i<CAT_FIELD_MAX;i++){ |
|---|
| 84 | if (!fwrite( cat_strings[i], (int)cat_sizes[i], 1, out) ) { |
|---|
| 85 | return GB_export_error("Disk Full"); |
|---|
| 86 | } |
|---|
| 87 | } |
|---|
| 88 | return 0; |
|---|
| 89 | } |
|---|
| 90 | |
|---|
| 91 | long cat_convert_tree_2_cat_rek(GBT_TREE *tree, CAT_tree *cat_tree, int deep, double scale, |
|---|
| 92 | int *level_counter, int& leaf_counter, int& node_counter){ |
|---|
| 93 | int nodec = node_counter++; |
|---|
| 94 | CAT_node *node = & cat_tree->nodes[nodec]; |
|---|
| 95 | node->deep = deep; |
|---|
| 96 | if (tree->is_leaf) { |
|---|
| 97 | node->numbering[CAT_NUMBERING_LEAFS_ONLY] = leaf_counter++; |
|---|
| 98 | node->numbering[CAT_NUMBERING_LEVELS] = node->numbering[CAT_NUMBERING_LEAFS_ONLY] + cat_tree->nnodes/2; |
|---|
| 99 | }else{ |
|---|
| 100 | node->numbering[CAT_NUMBERING_LEVELS] = level_counter[deep]; |
|---|
| 101 | level_counter[deep]++; |
|---|
| 102 | |
|---|
| 103 | node->leftson = cat_convert_tree_2_cat_rek(tree->leftson,cat_tree,deep+1,scale, |
|---|
| 104 | level_counter,leaf_counter,node_counter); |
|---|
| 105 | node->rightson = cat_convert_tree_2_cat_rek(tree->rightson,cat_tree,deep+1,scale, |
|---|
| 106 | level_counter,leaf_counter,node_counter); |
|---|
| 107 | cat_tree->nodes[node->leftson].father = nodec; |
|---|
| 108 | cat_tree->nodes[node->rightson].father = nodec; |
|---|
| 109 | |
|---|
| 110 | double l = tree->leftlen; |
|---|
| 111 | double r = tree->rightlen; |
|---|
| 112 | |
|---|
| 113 | if (l<0) l = 0.0; |
|---|
| 114 | if (r<0) r = 0.0; |
|---|
| 115 | |
|---|
| 116 | cat_tree->nodes[node->leftson].branch_length_float = l; |
|---|
| 117 | cat_tree->nodes[node->rightson].branch_length_float = r; |
|---|
| 118 | |
|---|
| 119 | l *= scale; |
|---|
| 120 | r *= scale; |
|---|
| 121 | if (l>0 && l<1) l = 1; |
|---|
| 122 | if (r>0 && r<1) r = 1; |
|---|
| 123 | |
|---|
| 124 | if (l> 255) l = 255; |
|---|
| 125 | if (r > 255) l = 255; |
|---|
| 126 | |
|---|
| 127 | cat_tree->nodes[node->leftson].branch_length_byte = (unsigned char)l; |
|---|
| 128 | cat_tree->nodes[node->rightson].branch_length_byte = (unsigned char)r; |
|---|
| 129 | } |
|---|
| 130 | if (tree->gb_node) { |
|---|
| 131 | int i; |
|---|
| 132 | const char *f; |
|---|
| 133 | GBDATA *gbd; |
|---|
| 134 | int start = 0; |
|---|
| 135 | int end = 1000000; |
|---|
| 136 | for (i=0;i<CAT_FIELD_MAX;i++) { |
|---|
| 137 | start = 0; |
|---|
| 138 | end = 1000000; |
|---|
| 139 | f = CAT_field_names[i]; |
|---|
| 140 | gbd = GB_find(tree->gb_node,f,0,down_level); |
|---|
| 141 | if (gbd){ |
|---|
| 142 | char * s= GB_read_as_string(gbd); |
|---|
| 143 | if (s && strlen(s)){ // append text to output stream |
|---|
| 144 | node->field_offsets[i] = GBS_memoffset(cat_mem_files[i]); |
|---|
| 145 | GBS_strcat(cat_mem_files[i],s); |
|---|
| 146 | GBS_chrcat(cat_mem_files[i],1); // seperated by '^A' |
|---|
| 147 | delete s; |
|---|
| 148 | } |
|---|
| 149 | } |
|---|
| 150 | } |
|---|
| 151 | gbd = GB_find(tree->gb_node,"grouped",0,down_level); |
|---|
| 152 | if (gbd) { |
|---|
| 153 | node->is_grouped_in = GB_read_byte(gbd); |
|---|
| 154 | } |
|---|
| 155 | node->color_in = T2J_COLOR_NO - GB_read_flag(tree->gb_node); |
|---|
| 156 | }else{ |
|---|
| 157 | if (tree->name){ |
|---|
| 158 | if (tree->is_leaf){ |
|---|
| 159 | char *name = strdup(tree->name); |
|---|
| 160 | char *fullname = strchr(name,'#'); |
|---|
| 161 | if (fullname){ |
|---|
| 162 | *(fullname++) = 0; |
|---|
| 163 | }else{ |
|---|
| 164 | fullname = name; |
|---|
| 165 | } |
|---|
| 166 | |
|---|
| 167 | node->field_offsets[CAT_FIELD_NAME] = GBS_memoffset(cat_mem_files[CAT_FIELD_NAME]); |
|---|
| 168 | GBS_strcat(cat_mem_files[CAT_FIELD_NAME],name); |
|---|
| 169 | GBS_chrcat(cat_mem_files[CAT_FIELD_NAME],1); // seperated by '^A' |
|---|
| 170 | |
|---|
| 171 | node->field_offsets[CAT_FIELD_FULL_NAME] = GBS_memoffset(cat_mem_files[CAT_FIELD_FULL_NAME]); |
|---|
| 172 | GBS_strcat(cat_mem_files[CAT_FIELD_FULL_NAME],fullname); |
|---|
| 173 | GBS_chrcat(cat_mem_files[CAT_FIELD_FULL_NAME],1);// seperated by '^A' |
|---|
| 174 | delete name; |
|---|
| 175 | }else{ |
|---|
| 176 | node->field_offsets[CAT_FIELD_GROUP_NAME] = GBS_memoffset(cat_mem_files[CAT_FIELD_GROUP_NAME]); |
|---|
| 177 | GBS_strcat(cat_mem_files[CAT_FIELD_GROUP_NAME],tree->name); |
|---|
| 178 | GBS_chrcat(cat_mem_files[CAT_FIELD_GROUP_NAME],1); // seperated by '^A' |
|---|
| 179 | } |
|---|
| 180 | }else{ |
|---|
| 181 | node->color_in = T2J_COLOR_UNKNOWN; |
|---|
| 182 | } |
|---|
| 183 | } |
|---|
| 184 | return nodec; |
|---|
| 185 | } |
|---|
| 186 | |
|---|
| 187 | /** calculates the leafes for all inner nodes */ |
|---|
| 188 | long calcnleafs(CAT_tree *cat_tree, int nodenr){ |
|---|
| 189 | if (cat_tree->nodes[nodenr].leftson) { |
|---|
| 190 | int sum = calcnleafs(cat_tree, cat_tree->nodes[nodenr].leftson) + |
|---|
| 191 | calcnleafs(cat_tree, cat_tree->nodes[nodenr].rightson); |
|---|
| 192 | cat_tree->nodes[nodenr].nleafs = sum; |
|---|
| 193 | return sum; |
|---|
| 194 | } |
|---|
| 195 | cat_tree->nodes[nodenr].nleafs = 1; |
|---|
| 196 | return 1; |
|---|
| 197 | } |
|---|
| 198 | |
|---|
| 199 | CAT_tree *convert_GBT_tree_2_CAT_tree(GBT_TREE *tree) { |
|---|
| 200 | CAT_tree *cat_tree; |
|---|
| 201 | int size_of_tree = CATL_size_of_tree(tree); |
|---|
| 202 | cat_tree = new_CAT_tree(size_of_tree); |
|---|
| 203 | cat_tree->nnodes = size_of_tree; |
|---|
| 204 | int leaf_counter = 0; |
|---|
| 205 | int node_counter = 0; |
|---|
| 206 | int *level_counter = (int *)calloc(sizeof(int),size_of_tree); |
|---|
| 207 | |
|---|
| 208 | double scale = 255/CATL_tree_max_deep(tree); |
|---|
| 209 | cat_convert_tree_2_cat_rek(tree, cat_tree, 0,scale, |
|---|
| 210 | level_counter, leaf_counter, node_counter); |
|---|
| 211 | int i; |
|---|
| 212 | for (i = 1; i < size_of_tree; i++) level_counter[i] += level_counter[i-1]; |
|---|
| 213 | for (i = 0; i < size_of_tree; i++) { |
|---|
| 214 | CAT_node *node = & cat_tree->nodes[i]; |
|---|
| 215 | if (node->deep && node->leftson){ |
|---|
| 216 | node->numbering[CAT_NUMBERING_LEVELS] += level_counter[node->deep-1]; |
|---|
| 217 | } |
|---|
| 218 | } |
|---|
| 219 | calcnleafs(cat_tree,0); |
|---|
| 220 | |
|---|
| 221 | delete level_counter; |
|---|
| 222 | return cat_tree; |
|---|
| 223 | } |
|---|
| 224 | |
|---|
| 225 | void cat_debug_tree(CAT_tree *cat_tree){ |
|---|
| 226 | int i; |
|---|
| 227 | for (i=0;i< cat_tree->nnodes; i++){ |
|---|
| 228 | CAT_node *node = & cat_tree->nodes[i]; |
|---|
| 229 | printf("%4i f%4li l%4li r%4li lfn%4li lvl%4li d%4li bl%4i ", |
|---|
| 230 | i, |
|---|
| 231 | node->father, |
|---|
| 232 | node->leftson, |
|---|
| 233 | node->rightson, |
|---|
| 234 | node->numbering[CAT_NUMBERING_LEAFS_ONLY], |
|---|
| 235 | node->numbering[CAT_NUMBERING_LEVELS], |
|---|
| 236 | node->deep, |
|---|
| 237 | (int)node->branch_length_byte); |
|---|
| 238 | int j; |
|---|
| 239 | for (j=0;j<CAT_FIELD_MAX;j++){ |
|---|
| 240 | if (node->field_offsets[j]) { |
|---|
| 241 | printf("%15s ",cat_tree->data+node->field_offsets[j]); |
|---|
| 242 | } |
|---|
| 243 | } |
|---|
| 244 | printf("\n"); |
|---|
| 245 | } |
|---|
| 246 | } |
|---|
| 247 | |
|---|
| 248 | GB_ERROR create_and_save_CAT_tree(GBT_TREE *tree, const char *path){ |
|---|
| 249 | GB_ERROR error = 0; |
|---|
| 250 | int i; |
|---|
| 251 | for (i = 0; i < CAT_FIELD_MAX; i++){ |
|---|
| 252 | cat_mem_files[i] = GBS_stropen(CAT_INIT_STRING_FILE_SIZE); |
|---|
| 253 | GBS_chrcat(cat_mem_files[i],1); // write 1 byte so that all further |
|---|
| 254 | // entries get a non zero offset |
|---|
| 255 | } |
|---|
| 256 | CAT_tree *cat_tree = convert_GBT_tree_2_CAT_tree(tree); |
|---|
| 257 | if (!cat_tree) return GB_get_error(); |
|---|
| 258 | FILE *out = fopen(path,"w"); |
|---|
| 259 | if (!out) return GB_export_error("Sorry cannot write to file '%s'",path); |
|---|
| 260 | error = cat_write_cat_tree(cat_tree,out); |
|---|
| 261 | fclose(out); |
|---|
| 262 | return error; |
|---|
| 263 | |
|---|
| 264 | } |
|---|
| 265 | |
|---|
| 266 | GB_ERROR create_and_save_CAT_tree(GBDATA *gb_main, const char *tree_name, const char *path){ |
|---|
| 267 | GB_transaction dummy(gb_main); |
|---|
| 268 | |
|---|
| 269 | GBT_TREE *tree = GBT_read_tree(gb_main,tree_name, sizeof(GBT_TREE) ); |
|---|
| 270 | if (!tree) return GB_get_error(); |
|---|
| 271 | GB_ERROR error = GBT_link_tree(tree,gb_main,GB_FALSE); |
|---|
| 272 | if (error) return error; |
|---|
| 273 | error = create_and_save_CAT_tree(tree,path); |
|---|
| 274 | return error; |
|---|
| 275 | } |
|---|