Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

import_from_table.pl

Visit:

Last change on this file was 19502, checked in by westram, 8 months ago
import from calc-sheet now interprets lines ending with ',' as empty cell
Property svn:executable set to ``*
File size: 18.6 KB

Line
1	#!/usr/bin/perl
2	# =============================================================== #
3	# #
4	# File : import_from_table.pl #
5	# Purpose : import data from comma- or tab-separated tables #
6	# #
7	# Coded by Ralf Westram (coder@reallysoft.de) in January 2011 #
8	# Institute of Microbiology (Technical University Munich) #
9	# http://www.arb-home.de/ #
10	# #
11	# =============================================================== #
12
13	use strict;
14	use warnings;
15
16	BEGIN {
17	if (not exists $ENV{'ARBHOME'}) { die "Environment variable \$ARBHOME has to be defined"; }
18	my $arbhome = $ENV{'ARBHOME'};
19	push @INC, "$arbhome/lib";
20	push @INC, "$arbhome/PERL_SCRIPTS/lib";
21	1;
22	}
23
24	use ARB;
25	use tools;
26
27	sub usage() {
28	print(
29	"Usage: perl import_from_table.pl --match CF --write CF [options] datafile [database outdb]\n".
30	"\n".
31	"Imports one column from the calc-sheet 'datafile' into an ARB database.\n".
32	"\n".
33	"'datafile' should be a list of tab-separated values.\n".
34	"'database' if a name is specified, the modified DB will be saved as 'outdb'.\n".
35	" Otherwise the database running in ARB will be modified.\n".
36	"\n".
37	"--match CF CF:=column,field\n".
38	" Define a 'column' in the 'datafile' and a species-'field' in the database.\n".
39	" For each row the content of the 'column' has to match the content of the\b".
40	" 'field' for exactly one species in the 'database'.\n".
41	" Useful fields are 'acc' and 'name'.\n".
42	"--write CF CF:=column,field\n".
43	" For each row in 'datafile' write the content of 'column' into the\n".
44	" 'field' of the species matched via --match\n".
45	"--mode MM MM:=the match mode\n".
46	" Use '=' for plain string comparison (traditional behavior)\n".
47	" Also supports wordwise comparison - see http://help.arb-home.de/agde_import_calc.html\n".
48	"\n".
49	"Available 'options':\n".
50	"--csv expect 'datafile' is a list of comma-separated values (default: TAB-separated)\n".
51	"--overwrite overwrite 'field' specified via --write (default: abort if 'field' exists)\n".
52	"--skip-unknown silently skip rows that don't match any species (default: abort if no match found)\n".
53	"--skip-empty silently skip rows where match column is empty (default: error if occurs multiple)\n".
54	"--marked-only only write to marked species (default: all species)\n".
55	"--mark mark species to which field has been imported (unmarks rest)\n".
56	"--as-integer use INTEGER database-type for field (default: STRING)\n"
57	);
58
59	}
60
61	sub max($$) { my ($a,$b) = @_; return $a<$b ? $b : $a; }
62
63	sub parse_CF($$) {
64	my ($switch,$CF) = @_;
65	my ($column,$field);
66	eval {
67	if ($CF =~ /^([^,]+),(.*)$/o) {
68	($column,$field) = ($1,$2);
69	my $int_column = int($column);
70	if ($int_column<1) { die "'$column' is not a valid column\n"; }
71	my $error = ARB::check_key($field);
72	if (defined $error) { die "'$field' is not a valid DB field name\n"; }
73	}
74	else { die "',' expected in '$CF'\n"; }
75	};
76	if ($@) { die "in '$switch $CF': $@\n"; }
77	return ($column,$field);
78	}
79
80	# ---------------------------------------- [customized column parser]
81
82	my $reg_plain_normal = undef;
83	my $reg_plain_lastCol = undef;
84	my $reg_quoted_normal = undef;
85	my $reg_quoted_lastCol = undef;
86
87	sub set_separator($) {
88	my ($sep) = @_;
89
90	my $plain_rex = "^([^$sep\"\r\n]*)"; # plain, probably empty column
91	my $quoted_rex = "^\"((\"\"\|[^\"]))\""; # quoted column (may contain "" inside quotes)
92
93	$reg_plain_lastCol = qr/$plain_rex\n/;
94	$reg_quoted_lastCol = qr/$quoted_rex\n/;
95
96	$reg_plain_normal = qr/$plain_rex$sep/;
97	$reg_quoted_normal = qr/$quoted_rex$sep/;
98	}
99
100	sub parse_column(\$) {
101	my ($line_r) = @_;
102	my ($quoted,$lastCol) = (-1,-1);
103
104	if ($$line_r =~ $reg_plain_normal) { $quoted = 0; $lastCol = 0; }
105	elsif ($$line_r =~ $reg_quoted_normal) { $quoted = 1; $lastCol = 0; }
106	elsif ($$line_r =~ $reg_plain_lastCol) { $quoted = 0; $lastCol = 1; }
107	elsif ($$line_r =~ $reg_quoted_lastCol) { $quoted = 1; $lastCol = 1; }
108
109	if ($quoted == -1) {
110	return (undef, undef, undef);
111	}
112
113	my $col = $1;
114	$$line_r = $';
115	if ($quoted == 1) {
116	if ($col =~ /\"/) {
117	$col =~ s/\"\"/\"/og;
118	}
119	}
120	return ($col, $quoted, $lastCol);
121	}
122
123	# ---------------------------------------- [custom CLI flags]
124
125	my ($matchcolumn,$matchfield);
126	my ($writecolumn,$writefield);
127
128	my $skip_unknown = 0;
129	my $skip_empty = 0;
130	my $overwrite = 0;
131	my $marked_only = 0;
132	my $mark = 0;
133	my $help_requested = 0;
134	my $int_type = 0;
135
136	# The next variables either
137	# - are undef if field/cell shall not get splitted, or
138	# - contain regexpr to be used with split().
139	my $reg_split_field = undef;
140	my $reg_split_cell = undef;
141	my $matchmode = undef;
142	my $ignore_dup_words = 0;
143
144	# ---------------------------------------- [custom CLI flags end]
145
146	sub main() {
147	my $datafile;
148	my $database = ':';
149	my $database_out = undef;
150
151	my @no_option = ();
152
153	set_separator("\t");
154
155	eval {
156	while (scalar(@ARGV)>0) {
157	my $arg = shift @ARGV;
158	if ($arg eq '--match') { ($matchcolumn,$matchfield) = parse_CF($arg, shift @ARGV); }
159	elsif ($arg eq '--write') { ($writecolumn,$writefield) = parse_CF($arg, shift @ARGV); }
160	elsif ($arg eq '--mode') { $matchmode = shift @ARGV; }
161	elsif ($arg eq '--csv') { set_separator(','); }
162	elsif ($arg eq '--overwrite') { $overwrite = 1; }
163	elsif ($arg eq '--skip-unknown') { $skip_unknown = 1; }
164	elsif ($arg eq '--skip-empty') { $skip_empty = 1; }
165	elsif ($arg eq '--marked-only') { $marked_only = 1; }
166	elsif ($arg eq '--mark') { $mark = 1; }
167	elsif ($arg eq '--as-integer') { $int_type = 1; }
168	elsif ($arg eq '--help') { $help_requested = 1; return; } # only returns from eval!
169	else { push @no_option, $arg; }
170	}
171
172	foreach (@no_option) {
173	if (/^--/) {
174	die "Unknown switch '$_'\n";
175	}
176	}
177
178	$datafile = shift @no_option;
179	if (not defined $datafile) { die "Missing argument 'datafile'\n"; }
180
181	if (scalar(@no_option)) {
182	$database = shift @no_option;
183	$database_out = shift @no_option;
184	if (not defined $database_out) { die "Missing argument 'outdb'\n"; }
185	}
186
187	if (scalar(@no_option)) { die "Unexpected arguments: ".join(',', @no_option)."\n"; }
188
189	if (not defined $matchcolumn) { die "Mandatory option '--match CF' missing\n"; }
190	if (not defined $writecolumn) { die "Mandatory option '--write CF' missing\n"; }
191
192	if (not defined $matchmode) { $matchmode = '='; }
193	if ($matchmode ne '=') {
194	eval {
195	my $mm = $matchmode;
196	while ($mm ne '') {
197	if ($mm =~ /^([cf])([wd=])/o) {
198	my ($target, $mode) = ($1, $2);
199	$mm = $';
200	my $target_reg = $target eq 'c' ? \$reg_split_cell : \$reg_split_field;
201	if ($mode eq '=') {
202	$$target_reg = undef;
203	}
204	else {
205	if ($mm =~ /^(.)/o) {
206	my $sep = $1;
207	$mm = $';
208	my $regexpr = qr/$sep/;
209	$$target_reg = $regexpr;
210	if ($mode eq 'd') {
211	if ($target ne 'c') { die "invalid use of '$mode' after '$target'\n"; }
212	$ignore_dup_words = 1;
213	}
214	}
215	else {
216	die "expected a separator char behind '$target$mode'\n";
217	}
218	}
219	}
220	else {
221	die "unexpected content seen at '$mm'\n";
222	}
223	}
224	};
225	if ($@) {
226	chomp $@;
227	die "could not handle matchmode '$matchmode' (Reason: $@)";
228	}
229	}
230	};
231
232	if ($@) {
233	die "\nError: $@(use --help to show usage)\n ";
234	}
235	if ($help_requested) {
236	usage();
237	}
238	else {
239	work($datafile, $database, $database_out);
240	}
241	}
242
243	sub trim($) {
244	my ($str) = @_;
245	$str =~ s/^[\s]+//go;
246	$str =~ s/[\s]+$//go;
247	return $str;
248	}
249
250	sub split_wordwise($$) {
251	my ($value, $reg_split) = @_;
252	my @splitted = split $reg_split, $value;
253	my %words = map {
254	my $word = trim($_);
255	if ($word eq '') { ; }
256	else { $word => 1; }
257	} @splitted;
258	return keys %words;
259	}
260
261	sub work($$$) {
262	my ($datafile, $database, $database_out) = @_;
263
264	my $inform_ARB = 0; # [previously was defined globally]
265	my $gb_main = ARB::open($database, "rw");
266	if ($database eq ':') {
267	if ($gb_main) { $inform_ARB = 1; }
268	else { expectError('db connect (no running ARB)'); }
269	}
270	else {
271	$gb_main \|\| expectError('db connect (wrong \'database\' specified?)');
272	}
273
274	my %write_table = (); # key=matchvalue, value=writevalue
275	my %source_line = (); # key=matchvalue, value=source-linenumber (corrected by joined_lines)
276	my %cell_word = (); # key=word from matchvalue (trimmed), value=matchvalue
277
278	my $joined_lines = 0;
279	my $no_content = 0;
280
281	eval {
282	if (not -f $datafile) { die "No such file '$datafile'\n"; }
283	open(TABLE,'<'.$datafile) \|\| die "can't open '$datafile' (Reason: $!)\n";
284
285	eval {
286	my $min_elems = max($matchcolumn,$writecolumn);
287	my $line;
288	my $current_line; # line number
289
290	while (defined($line=<TABLE>)) {
291	eval {
292	my @row = ();
293
294	my $done = 0;
295	while ($done == 0) {
296	my ($column, $wasQuoted, $wasLastCol) = parse_column($line);
297	# print "column='$column' wasQuoted=$wasQuoted wasLastCol=$wasLastCol\n";
298	if (defined $column) {
299	push @row, $column;
300
301	if ($wasLastCol==1 and $line =~ /^[\r\n]*$/o) { # only LF + CR or nothing left -> done with line
302	$done = 1;
303	}
304	}
305	else { # test whether line contains quoted LF -> join next line
306	if ($line =~ /\"/o) {
307	my $nextLine = <TABLE>;
308	if (not defined $nextLine) {
309	die "reached EOF while attempting to append multiline (quoted entry seems to contain LF/CR)\n";
310	}
311	$line .= $nextLine;
312	$joined_lines++;
313	# print "(detected quoted LF/CR -> appended next line)\n";
314	}
315	else {
316	die "cannot interpret rest of line: '$line'\n";
317	}
318	}
319	}
320
321	my $relems = scalar(@row);
322	if ($relems<$min_elems) {
323	die "need at least $min_elems columns per table-line\n".
324	"(seen only $relems column. Maybe wrong separator chosen?)\n";
325	}
326
327	my $matchvalue = $row[$matchcolumn-1];
328	my $writevalue = $row[$writecolumn-1];
329
330	if ($matchvalue eq '' and $skip_empty) {
331	# skip rows with empty match cell, if requested via option!
332	$no_content++;
333	return; # from eval block
334	}
335
336	if (exists $write_table{$matchvalue}) {
337	die "duplicated value '$matchvalue' in column $matchcolumn (first seen in row ".$source_line{$matchvalue}.")\n";
338	}
339
340	$current_line = $.-$joined_lines;
341	$write_table{$matchvalue} = $writevalue;
342	$source_line{$matchvalue} = $current_line;
343
344	if (defined $reg_split_cell) {
345	my @words = split_wordwise($matchvalue, $reg_split_cell);
346	foreach my $word (@words) {
347	if (exists $cell_word{$word}) {
348	my $first_occurred_line = $source_line{$cell_word{$word}};
349	my $dup_msg = "duplicated word '$word' (first seen in row $first_occurred_line)";
350
351	if ($ignore_dup_words) {
352	if ($cell_word{$word} ne '') {
353	print "Warning: line $current_line: ignoring $dup_msg\n";
354	}
355	}
356	else {
357	die "Error: $dup_msg\n";
358	}
359	}
360	else {
361	$cell_word{$word} = $matchvalue;
362	}
363	}
364	}
365	};
366	if ($@) { die "$@ (in row $current_line of '$datafile')\n"; }
367	}
368
369	# match and write to species
370	dieOnError(ARB::begin_transaction($gb_main), 'begin_transaction');
371
372	my $report = '';
373
374	eval {
375	my $ambiguous_hits = 0;
376	my %written = (); # key=matchvalue, value: 1=written, 2=skipped cause not marked (but did match)
377
378	for (my $gb_species = BIO::first_species($gb_main);
379	$gb_species;
380	$gb_species = BIO::next_species($gb_species)) {
381	eval {
382	my $species_value = BIO::read_as_string($gb_species, $matchfield);
383	my $wanted_mark = 0;
384	if ($species_value) {
385	my $matched_value = undef;
386
387	if (defined $reg_split_field) {
388	my @words = split_wordwise($species_value, $reg_split_field);
389	my %matched = (); # key=matchvalue, value=by which word
390	foreach my $word (@words) {
391	my $mv = undef;
392	if ($reg_split_cell) {
393	$mv = $cell_word{$species_value};
394	}
395	else {
396	if (exists $write_table{$species_value}) {
397	$mv = $species_value;
398	}
399	}
400	if (defined $mv) {
401	$matched{$mv} = $word;
402	}
403	}
404	my @matched = keys %matched;
405	my $matched = scalar(@matched);
406	if ($matched>0) {
407	if ($matched==1) {
408	$matched_value = $matched[0];
409	}
410	else {
411	my @ambig_words = values %matched;
412	my @quoted_words = map { $_ => "'$_'"; } @ambig_words;
413	my $ambig_words = join ", ", @quoted_words;
414	die "words in field '$matchfield' hit multiple table rows ($ambig_words)";
415	}
416	}
417	}
418	else {
419	if (defined $reg_split_cell) {
420	$matched_value = $cell_word{$species_value};
421	}
422	else {
423	if (exists $write_table{$species_value}) {
424	$matched_value = $species_value;
425	}
426	}
427	}
428
429	if (defined $matched_value) { # found table entry matching current species
430	if ($marked_only==1 and ARB::read_flag($gb_species)==0) {
431	$written{$matched_value} = 2;
432	}
433	else {
434	my $existing_entry = BIO::read_as_string($gb_species, $writefield);
435	if ($existing_entry and not $overwrite) {
436	die "already has an existing field '$writefield'.\n".
437	"Use --overwrite to allow replacement.\n";
438	}
439	my $error = undef;
440	if ($int_type==1) {
441	$error = BIO::write_int($gb_species, $writefield, int($write_table{$matched_value}));
442	}
443	else {
444	$error = BIO::write_string($gb_species, $writefield, $write_table{$matched_value});
445	}
446	if ($error) { die $error; }
447	$wanted_mark = 1;
448	my $prev_written = $written{$matched_value};
449	if (defined $prev_written) {
450	$ambiguous_hits++;
451	}
452	$written{$matched_value} = 1;
453	}
454	}
455	}
456	if ($mark==1) {
457	my $error = ARB::write_flag($gb_species,$wanted_mark);
458	if ($error) { die $error; }
459	}
460	};
461	if ($@) {
462	my $name = BIO::get_name_or_description($gb_species);
463	die "species '$name': $@";
464	}
465	}
466	my $not_found = 0;
467	my $not_marked = 0;
468	{
469	my %missing = ();
470	my $what = $skip_unknown ? 'Warning' : 'Error';
471	foreach (keys %write_table) {
472	my $wr = $written{$_};
473	if (not defined $wr) {
474	$missing{$_} = 1;
475	$not_found++;
476	}
477	elsif ($wr==2) { # was not marked
478	$not_marked++;
479	}
480	}
481
482	if ($not_found>0) {
483	my $shown = 0;
484	my $maxShown = 30;
485	my $show_all = 0;
486	if ($not_found<50 or $skip_unknown==0) {
487	$show_all = 1;
488	}
489
490	SHOWN: foreach (sort { $source_line{$a} <=> $source_line{$b}; } keys %missing) {
491	print "$what: Found no matching species for row ".$source_line{$_}." ($matchfield='$_')\n";
492	$shown++;
493	if ($show_all==0 and $shown>$maxShown) {
494	print "$what: (suppressing rest of $not_found messages)\n";
495	print "Hint: to list all unmatched rows, do NOT allow to skip them using --skip-unknown\n";
496	last SHOWN;
497	}
498	}
499	}
500	}
501	if ($not_found>0 and $skip_unknown==0) {
502	die "Failed to find $not_found species - aborting.\n".
503	"(Note: use --skip-unknown to allow unknown references)\n";
504	}
505	$report = "Entries imported: ".(scalar(keys %written)-$not_marked)."\n";
506	if ($ambiguous_hits>0) { $report .= "Ambiguous hits: $ambiguous_hits\n"; }
507	if ($no_content>0) { $report .= "Skipped rows with empty match-cell: $no_content\n"; }
508	if ($not_found>0) { $report .= "Unmatched (skipped) entries: $not_found\n"; }
509	if ($not_marked>0) { $report .= "Entries not imported because species were not marked: $not_marked\n"; }
510
511	print "\n".$report;
512	};
513	if ($@) {
514	ARB::abort_transaction($gb_main);
515	die $@;
516	}
517	ARB::commit_transaction($gb_main);
518	if ($database ne ':') { # database has been loaded
519	print "Saving modified database to '$database_out'\n";
520	my $error = ARB::save_as($gb_main, $database_out, "b");
521	if ($error) { die $error; }
522	}
523	ARB::close($gb_main);
524
525	if ($inform_ARB==1) {
526	$report =~ s/\n$//;
527	`arb_message "$report"`;
528	}
529	};
530	if ($@) {
531	close(TABLE);
532	die $@;
533	}
534	};
535	if ($@) {
536	ARB::close($gb_main);
537	die $@;
538	}
539	}
540
541	# call main()
542
543	eval {
544	set_inGlobalEvalState(1);
545	main();
546	};
547	set_inGlobalEvalState(0);
548	if ($@) {
549	die $@; # this die message calls arb_message (see ARB.pm)
550	exit(-1);
551	}
552	exit(0);

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/lib/PERL_SCRIPTS/ARBTOOLS/import_from_table.pl

Download in other formats: