1 | // Copyright (c) 2004 - 2005 Kai Bader <baderk@in.tum.de> |
---|
2 | // |
---|
3 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
4 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
5 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
---|
6 | // AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN |
---|
7 | // AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
---|
8 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
---|
9 | |
---|
10 | // CVS REVISION TAG -- $Revision: 5825 $ |
---|
11 | |
---|
12 | |
---|
13 | /**************************************************************************** |
---|
14 | * TODO |
---|
15 | * |
---|
16 | * - THE CSV-IMPORT SHOULD DIFFER BETWEEN THE DATA TYPES |
---|
17 | * - CR/LF MUST BE REMOVED FROM THE LAST COLUMN |
---|
18 | ****************************************************************************/ |
---|
19 | |
---|
20 | |
---|
21 | #include <cstdio> |
---|
22 | #include <cstdlib> |
---|
23 | #include <iostream> |
---|
24 | #include <fstream> |
---|
25 | // |
---|
26 | #include <sys/types.h> |
---|
27 | #include <dirent.h> |
---|
28 | |
---|
29 | using namespace std; |
---|
30 | |
---|
31 | #include "file_import.hxx" |
---|
32 | |
---|
33 | |
---|
34 | /**************************************************************************** |
---|
35 | * IMPORT FUNCTION FOR A CSV DATA FILE (OPENS THE FILE AND RETURNS THE DATA) |
---|
36 | ****************************************************************************/ |
---|
37 | importTable *fileopenCSV(char *filename, int delimiter) |
---|
38 | { |
---|
39 | // DEFINE VARIABLES |
---|
40 | int inconsistencies= 0; |
---|
41 | int rows= 0; |
---|
42 | int total_columns= 0; |
---|
43 | int local_columns= 0; |
---|
44 | int cell_index; |
---|
45 | int cell_size; |
---|
46 | char **cells; |
---|
47 | char *cell; |
---|
48 | char *start_ptr; |
---|
49 | char *end_ptr; |
---|
50 | |
---|
51 | // ALLOCATE MEM FOR IMPORTTABLE STRUCTURE |
---|
52 | importTable *table= NULL; |
---|
53 | |
---|
54 | // // ALLOCATE MEM FOR IMPORTTABLE STRUCTURE |
---|
55 | // importTable *table= (importTable *)malloc(sizeof(importTable)); |
---|
56 | // if(!table) return NULL; |
---|
57 | |
---|
58 | // ALLOCATE MEMORY FOR READ BUFFER |
---|
59 | char *buffer= (char *)malloc(sizeof(char) * 10241); |
---|
60 | buffer[10240]=0; |
---|
61 | |
---|
62 | // DEFINE INPUT STREAM AND OPEN FILE |
---|
63 | ifstream iS; |
---|
64 | iS.open(filename); |
---|
65 | |
---|
66 | if(iS) |
---|
67 | { |
---|
68 | // PASS 1: GET CSV TABLE SIZE AND FIND INCONSISTENCIES |
---|
69 | |
---|
70 | while(iS.getline(buffer, 10240)) |
---|
71 | { |
---|
72 | rows++; |
---|
73 | local_columns= 0; |
---|
74 | start_ptr= buffer; |
---|
75 | |
---|
76 | if(*start_ptr) |
---|
77 | { |
---|
78 | local_columns++; |
---|
79 | } |
---|
80 | |
---|
81 | while(*start_ptr && (end_ptr= strchr(start_ptr, delimiter))) |
---|
82 | { |
---|
83 | local_columns++; |
---|
84 | start_ptr= end_ptr + 1; |
---|
85 | } |
---|
86 | |
---|
87 | if((local_columns != total_columns) && (rows > 1)) |
---|
88 | { |
---|
89 | inconsistencies++; |
---|
90 | |
---|
91 | printf("CSV-IMPORTER: found inconsistent data in row %d (%d instead of %d cells).\n", |
---|
92 | rows, local_columns, total_columns); |
---|
93 | } |
---|
94 | else |
---|
95 | { |
---|
96 | total_columns= local_columns; |
---|
97 | } |
---|
98 | |
---|
99 | } |
---|
100 | |
---|
101 | // CHECK IF INCONSISTENCIES HAVE OCCURED AND ABORT IF NECESSARY |
---|
102 | if(inconsistencies) |
---|
103 | { |
---|
104 | iS.close(); |
---|
105 | free(table); |
---|
106 | free(buffer); |
---|
107 | printf("CSV-IMPORTER: aborted due to inconsistent data (%d violations).\n", inconsistencies); |
---|
108 | return NULL; |
---|
109 | } |
---|
110 | |
---|
111 | // PASS 2: READ CSV-DATA FROM FILE INTO THE MEM-ARRAY |
---|
112 | |
---|
113 | // RESET FILE INPUT STREAM |
---|
114 | iS.clear(); |
---|
115 | iS.seekg(0, ios::beg); |
---|
116 | |
---|
117 | |
---|
118 | |
---|
119 | // ALLOCATE MEMORY FOR THE TABLE DATA |
---|
120 | table= createImportTable(rows, total_columns); |
---|
121 | if(!table) return NULL; |
---|
122 | |
---|
123 | // cells= (char **)malloc(rows * total_columns * sizeof(char *)); |
---|
124 | // if(!cells) |
---|
125 | // { |
---|
126 | // iS.close(); |
---|
127 | // free(table); |
---|
128 | // free(buffer); |
---|
129 | // printf("CSV-IMPORTER: unable to allocate memory for the table data.\n"); |
---|
130 | // return NULL; |
---|
131 | // } |
---|
132 | |
---|
133 | cells= table->cell; |
---|
134 | // table->rows= rows; |
---|
135 | // table->columns= total_columns; |
---|
136 | |
---|
137 | cell_index= 0; |
---|
138 | |
---|
139 | while(iS.getline(buffer, 10240)) |
---|
140 | { |
---|
141 | start_ptr= buffer; |
---|
142 | |
---|
143 | while(*start_ptr && (end_ptr= strchr(start_ptr, delimiter))) |
---|
144 | { |
---|
145 | // REMOVE QUOTATION MARKS, IF NECESSARY |
---|
146 | if(*start_ptr == '"' && *(end_ptr-1) == '"' && (start_ptr < (end_ptr-1))) |
---|
147 | { |
---|
148 | start_ptr++; |
---|
149 | *(end_ptr-1)= 0; |
---|
150 | } |
---|
151 | |
---|
152 | cell_size= (int)(end_ptr - start_ptr) + 1; |
---|
153 | cell= (char *)malloc(cell_size * sizeof(char)); |
---|
154 | |
---|
155 | *end_ptr= 0; |
---|
156 | strncpy(cell, start_ptr, cell_size); |
---|
157 | |
---|
158 | cells[cell_index]= cell; |
---|
159 | cell_index++; |
---|
160 | |
---|
161 | start_ptr= end_ptr + 1; |
---|
162 | } |
---|
163 | // if(*start_ptr) |
---|
164 | //{ |
---|
165 | end_ptr= start_ptr; |
---|
166 | while((*end_ptr) && (*end_ptr != 0x0D) && (*end_ptr != 0x0A)) end_ptr++; |
---|
167 | |
---|
168 | cell_size= (int)(end_ptr - start_ptr) + 1; |
---|
169 | cell= (char *)malloc(cell_size * sizeof(char)); |
---|
170 | |
---|
171 | *end_ptr= 0; |
---|
172 | strncpy(cell, start_ptr, cell_size); |
---|
173 | |
---|
174 | cells[cell_index]= cell; |
---|
175 | cell_index++; |
---|
176 | //} |
---|
177 | } |
---|
178 | } |
---|
179 | else |
---|
180 | { |
---|
181 | // SOMETHING WENT WRONG. FREE ALLOCATED MEM AND RETURN |
---|
182 | free(table); |
---|
183 | free(buffer); |
---|
184 | printf("CSV-IMPORTER: unable to open file: %s\n", filename); |
---|
185 | return NULL; |
---|
186 | } |
---|
187 | |
---|
188 | iS.close(); |
---|
189 | free(buffer); |
---|
190 | |
---|
191 | // IDENTIFY THE COLUMNS BEST FITTING DATATYPES |
---|
192 | identifyColumns(table); |
---|
193 | |
---|
194 | printf("CSV-IMPORTER: successfully imported file: %s (rows: %d, columns: %d)\n", filename, rows, total_columns); |
---|
195 | return table; |
---|
196 | } |
---|
197 | |
---|
198 | |
---|
199 | /**************************************************************************** |
---|
200 | * IMPORT FUNCTION FOR CSV DATA & HEADER (IMPORTS DIRECTLY INTO ARB) |
---|
201 | ****************************************************************************/ |
---|
202 | int importCSV(importTable *table, importData *data) |
---|
203 | { |
---|
204 | // FETCH ARB DATABASE HANDLE |
---|
205 | GBDATA *gb_main= get_gbData(); |
---|
206 | GBDATA *gb_experiment; |
---|
207 | GBDATA *gb_proteom, *gb_proteom_data; |
---|
208 | GBDATA *gb_protein, *gb_protein_data; |
---|
209 | char *head, *content; |
---|
210 | int rows= table->rows; |
---|
211 | int columns= table->columns; |
---|
212 | |
---|
213 | // GBDATA *gb_main= get_gbData(); |
---|
214 | // GBDATA *gb_prot, *gb_exp, *gb_prot_data, *gb_prot_name; |
---|
215 | // GBDATA *gb_protein, *gb_protein_entry; |
---|
216 | // char *head, *content; |
---|
217 | // int rows= table->rows; |
---|
218 | // int columns= table->columns; |
---|
219 | // CHECK IF AN ARB CONNECTION IS GIVEN |
---|
220 | if(!gb_main) |
---|
221 | { |
---|
222 | printf("CSV import failed - no ARB connection."); |
---|
223 | return -1; |
---|
224 | } |
---|
225 | |
---|
226 | // FIND EXPERIMENT ENTRY |
---|
227 | gb_experiment= find_experiment(data->species, data->experiment); |
---|
228 | |
---|
229 | // IF SO, EXIT THE FILE IMPORT (EVERYTHING OTHER WOULD LEAD TO INCONSISTENT DATA) |
---|
230 | if(!gb_experiment) |
---|
231 | { |
---|
232 | printf("CSV import failed - the given species or experiment name could not be resolved."); |
---|
233 | return -2; |
---|
234 | } |
---|
235 | |
---|
236 | // IS THERE ALREADY A PROTEOME WITH THE NEW FILENAME? |
---|
237 | gb_proteom= find_proteome(gb_experiment, data->proteome); |
---|
238 | |
---|
239 | // IF SO, EXIT THE FILE IMPORT (EVERYTHING OTHER WOULD LEAD TO INCONSISTENT DATA) |
---|
240 | if(gb_proteom) |
---|
241 | { |
---|
242 | printf("CSV import failed - the given proteome name already exists (must be unique)."); |
---|
243 | return -3; |
---|
244 | } |
---|
245 | |
---|
246 | // BEGIN ARB TRANSACTION |
---|
247 | ARB_begin_transaction(); |
---|
248 | |
---|
249 | // ENTER EXPERIMENT DATA ENTRY |
---|
250 | // IF THERE IS NO PROETOME_DATA ENTRY, CREATE A NEW ONE |
---|
251 | gb_proteom_data = GB_search(gb_experiment, "proteome_data", GB_CREATE_CONTAINER); |
---|
252 | pgt_assert(gb_proteom_data); // @@@ error handling is missing |
---|
253 | |
---|
254 | // CREATE NEW PROTEOME ENTRY |
---|
255 | gb_proteom = GB_create_container(gb_proteom_data, "proteome"); |
---|
256 | pgt_assert(gb_proteom); |
---|
257 | |
---|
258 | // ADD THE NAME TO THE NEW PROTEOME ENTRY |
---|
259 | GB_ERROR error = GBT_write_string(gb_proteom, "name", data->proteome); |
---|
260 | pgt_assert(!error); |
---|
261 | |
---|
262 | // CREATE PROTEINE DATA ENTRY |
---|
263 | gb_protein_data = GB_create_container(gb_proteom, "proteine_data"); |
---|
264 | pgt_assert(gb_protein_data); |
---|
265 | |
---|
266 | // IMPORT CELL DATA |
---|
267 | for(int r= 1; r < rows; r++) |
---|
268 | { |
---|
269 | // EACH ROW REPRESENTS A PROTEIN -> NEW CONTAINER |
---|
270 | gb_protein = GB_create_container(gb_protein_data, "protein"); |
---|
271 | pgt_assert(gb_protein); |
---|
272 | |
---|
273 | // TRAVERSE COLUMNS FOR EACH ROW AND CREATE ENTRIES |
---|
274 | for(int c= 0; c < columns; c++) |
---|
275 | { |
---|
276 | // CLEAN HEADER FOR USAGE AS ARB KEY |
---|
277 | head= GBS_string_2_key(table->header[c]); |
---|
278 | |
---|
279 | // FETCH CONTENT FOR THE NEW ENTRY |
---|
280 | content= table->cell[(r * columns) + c]; |
---|
281 | |
---|
282 | // CHECK IF WE HAVE AN ENTRY |
---|
283 | if(content) |
---|
284 | { |
---|
285 | // CHECK THE CORRECT COLUMN TYPE |
---|
286 | if(table->hasTypes) |
---|
287 | { |
---|
288 | switch(table->columnType[c]) |
---|
289 | { |
---|
290 | case DATATYPE_INT: error = GBT_write_int(gb_protein, head, atol(content)); break; |
---|
291 | case DATATYPE_FLOAT: error = GBT_write_float(gb_protein, head, atof(content)); break; |
---|
292 | default: error = GBT_write_string(gb_protein, head, content); break; |
---|
293 | } |
---|
294 | } |
---|
295 | else |
---|
296 | { |
---|
297 | error = GBT_write_string(gb_protein, head, content); |
---|
298 | } |
---|
299 | pgt_assert(!error); |
---|
300 | } |
---|
301 | } |
---|
302 | } |
---|
303 | |
---|
304 | // END ARB TRANSACTION |
---|
305 | ARB_commit_transaction(); |
---|
306 | |
---|
307 | return 0; |
---|
308 | } |
---|
309 | |
---|
310 | |
---|
311 | /**************************************************************************** |
---|
312 | * CREATE AN IMPORT TABLE |
---|
313 | ****************************************************************************/ |
---|
314 | importTable *createImportTable(int rows, int columns) |
---|
315 | { |
---|
316 | // ALLOCATE MEMORY FOR THE TABLE DATA |
---|
317 | importTable *table= (importTable *)malloc(sizeof(importTable)); |
---|
318 | if(!table) return NULL; |
---|
319 | |
---|
320 | // ALLOCATE MEMORY FOR THE CELL DATA |
---|
321 | char **cells= (char **)malloc(rows * columns * sizeof(char *)); |
---|
322 | if(!cells) return NULL; |
---|
323 | |
---|
324 | // ALLOCATE MEMORY FOR THE HEADER DATA |
---|
325 | char **header= (char **)malloc(columns * sizeof(char *)); |
---|
326 | if(!header) return NULL; |
---|
327 | |
---|
328 | // INIT ALL CELL VALUES WITH NULL |
---|
329 | int i; |
---|
330 | for(i= 0; i < (rows * columns); i++) cells[i]= NULL; |
---|
331 | for(i= 0; i < columns; i++) header[i]= NULL; |
---|
332 | |
---|
333 | // ALLOCATE MEMORY FOR THE COLUMN TYPE DATA |
---|
334 | int *columnType= (int *)malloc(columns * sizeof(int)); |
---|
335 | if(!columnType) return NULL; |
---|
336 | |
---|
337 | // ENTER VALID PREDEFINED VALUES (SHOULD BE CHANGED LATER) |
---|
338 | table->rows= rows; |
---|
339 | table->columns= columns; |
---|
340 | table->cell= cells; |
---|
341 | table->header= header; |
---|
342 | table->hasHeader= false; |
---|
343 | table->columnType= columnType; |
---|
344 | |
---|
345 | // RETURN POINTER TO TABLE |
---|
346 | return table; |
---|
347 | } |
---|
348 | |
---|
349 | |
---|
350 | /**************************************************************************** |
---|
351 | * FIND XSLT FILES (*.XSL) AS IMPORT FILTERS |
---|
352 | ****************************************************************************/ |
---|
353 | XSLTimporter *findXSLTFiles(char *path) |
---|
354 | { |
---|
355 | DIR *dir; |
---|
356 | struct dirent *dir_entry; |
---|
357 | int count= 0; |
---|
358 | char *name; |
---|
359 | |
---|
360 | // *** FIRST RUN, COUNT ENTRIES... |
---|
361 | |
---|
362 | // EXIT, IF THE GIVEN PATH WAS INCORRECT |
---|
363 | if((dir= opendir(path)) == NULL) return NULL; |
---|
364 | |
---|
365 | // TRAVERSE ALL FILES... |
---|
366 | while((dir_entry= readdir(dir)) != NULL) |
---|
367 | if(strstr(dir_entry->d_name, ".xsl") || strstr(dir_entry->d_name, ".XSL")) |
---|
368 | count++; |
---|
369 | |
---|
370 | // CLOSE DIRECTORY HANDLE |
---|
371 | closedir(dir); |
---|
372 | |
---|
373 | // *** SECOND RUN, ALLOCATE SPACE AND READ ALL NAMES |
---|
374 | |
---|
375 | // CREATE NEW STRUCT |
---|
376 | XSLTimporter *xslt= (XSLTimporter *)malloc(sizeof(XSLTimporter)); |
---|
377 | |
---|
378 | // CREATE PATH ENTRY |
---|
379 | xslt->path= (char *)malloc(sizeof(char) * (strlen(path) + 1)); |
---|
380 | strcpy(xslt->path, path); |
---|
381 | |
---|
382 | // ALLOCATE MEM FOR NAME ARRAY POINTER |
---|
383 | xslt->importer= (char **)malloc(sizeof(char *) * count); |
---|
384 | |
---|
385 | // EXIT, IF THE GIVEN PATH WAS INCORRECT |
---|
386 | if((dir= opendir(path)) == NULL) |
---|
387 | { |
---|
388 | free(xslt->importer); |
---|
389 | free(xslt->path); |
---|
390 | free(xslt); |
---|
391 | |
---|
392 | return NULL; |
---|
393 | } |
---|
394 | |
---|
395 | // TRAVERSE ALL FILES... |
---|
396 | count= 0; |
---|
397 | while((dir_entry= readdir(dir)) != NULL) |
---|
398 | { |
---|
399 | if(strstr(dir_entry->d_name, ".xsl") || strstr(dir_entry->d_name, ".XSL")) |
---|
400 | { |
---|
401 | name= (char *)malloc(sizeof(char) * (strlen(dir_entry->d_name) + 1)); |
---|
402 | strcpy(name, dir_entry->d_name); |
---|
403 | |
---|
404 | xslt->importer[count]= name; |
---|
405 | count++; |
---|
406 | } |
---|
407 | } |
---|
408 | |
---|
409 | xslt->number= count; |
---|
410 | |
---|
411 | return xslt; |
---|
412 | } |
---|
413 | |
---|
414 | |
---|
415 | /**************************************************************************** |
---|
416 | * IDENTIFY ENTRY TYPE |
---|
417 | * THIS FUNCTION TRIES TO IDENTIFY THE TYPE OF AN ENTRY (STRING, NUMBER) |
---|
418 | ****************************************************************************/ |
---|
419 | int identifyType(char *entry) |
---|
420 | { |
---|
421 | bool has_dot= false; |
---|
422 | bool has_numeric= true; |
---|
423 | char *ptr= entry; |
---|
424 | |
---|
425 | if(!ptr || (*ptr == 0)) return DATATYPE_UNKNOWN; |
---|
426 | |
---|
427 | while(*ptr) |
---|
428 | { |
---|
429 | if((*ptr == '.') || (*ptr == ',')) has_dot= true; |
---|
430 | else if((*ptr < '0') || (*ptr > '9')) has_numeric= false; |
---|
431 | |
---|
432 | ptr++; |
---|
433 | } |
---|
434 | |
---|
435 | if(has_dot && has_numeric) return DATATYPE_FLOAT; |
---|
436 | else if(has_numeric) return DATATYPE_INT; |
---|
437 | |
---|
438 | return DATATYPE_STRING; |
---|
439 | } |
---|
440 | |
---|
441 | |
---|
442 | /**************************************************************************** |
---|
443 | * TRY TO IDENTIFY THE COLUMN TYPE USING THEIR ENTRIES |
---|
444 | ****************************************************************************/ |
---|
445 | void identifyColumns(importTable *table) |
---|
446 | { |
---|
447 | // FUNCTION VARIABLES |
---|
448 | int rows= table->rows; |
---|
449 | int columns= table->columns; |
---|
450 | char **cell= table->cell; |
---|
451 | int *columnType= table->columnType; |
---|
452 | int c, r, colType, cellType; |
---|
453 | |
---|
454 | // TRAVERSE EVERY COLUMN |
---|
455 | for(c= 0; c < columns; c++) |
---|
456 | { |
---|
457 | colType= DATATYPE_UNKNOWN; |
---|
458 | |
---|
459 | // VIEW ALL ENTRIES AND IDENTIFY THE BEST FITTING TYPE |
---|
460 | for(r= 1; r < rows; r++) |
---|
461 | { |
---|
462 | // GET THE CELLS DATATYPE |
---|
463 | cellType= identifyType(cell[(r * columns) + c]); |
---|
464 | |
---|
465 | // CHANGE COLUMN TYPE IF A HIGHER DATATYPE IS FOUND |
---|
466 | if(cellType > colType) colType= cellType; |
---|
467 | |
---|
468 | // // DEBUG DEBUG DEBUG |
---|
469 | // if(cellType > colType) |
---|
470 | // { |
---|
471 | // printf("COLUMN %d: ENTRY \'%s\' SWITCHED FROM %d TO %d\n", |
---|
472 | // c, cell[(r * columns) + c], colType, cellType); |
---|
473 | // colType= cellType; |
---|
474 | // } |
---|
475 | // // DEBUG DEBUG DEBUG |
---|
476 | } |
---|
477 | |
---|
478 | columnType[c]= colType; |
---|
479 | } |
---|
480 | } |
---|
481 | |
---|
482 | |
---|
483 | |
---|
484 | |
---|