#!/usr/bin/perl
#
# Reads a secondary-structure report from STDIN and prints one flat record
# per sequence segment to STDOUT.
#
# NOTE(review): the input looks like DSSP output (the title line starts with
# "==== Secondary Structure Definition") -- confirm against the producer.
#
# Parsing runs through three states:
#   1. header    - collect the title line and the REFERENCE, HEADER, COMPND,
#                  SOURCE and AUTHOR lines; the AUTHOR line ends this state.
#   2. skipping  - ignore everything up to the line containing "RESIDUE AA".
#   3. residues  - read one residue per line; a '!' in the residue column
#                  starts a new sequence segment.
#
# For every segment the following lines are printed:
#   title line, REFERENCE, PDB_ID (with "_<chain>" appended when the chain
#   identifier is not a blank), DATE, HEADER, COMPND, SOURCE, AUTHOR,
#   SECSTRUCT, and SEQUENCE followed by the residue string on its own line.
#
# The script dies if any header field is missing, if the residue table is
# never reached, or if a line inside the residue table cannot be parsed.

use warnings;
use strict;

# Parser states.
my $STATE_HEADER   = 0;    # collecting header fields
my $STATE_SKIPPING = 1;    # waiting for the "RESIDUE AA" column header
my $STATE_RESIDUES = 2;    # reading per-residue lines

# Header fields; they stay undef until the matching input line is seen.
my $titleline;
my $pdb_id;
my $header;
my $compnd;
my $source;
my $date;
my $author;
my $reference;

# Per-segment data, indexed by segment number. Segment 0 always exists.
my @chains     = (' ');    # chain identifier of each segment (' ' = none)
my @sequences  = ('');     # residue letters of each segment
my @secstructs = ('');     # structure codes of each segment ('-' = none)
my $seqnum     = 0;        # index of the segment currently being filled

my $mode = $STATE_HEADER;

# Read line by line instead of slurping all of STDIN into a list first.
while (defined(my $line = <STDIN>)) {
    chomp $line;
    $line =~ s/\r\z//;     # tolerate CRLF line endings

    if ($mode == $STATE_HEADER) {
        # Every header line of interest ends in whitespace followed by '.'.
        if ($line =~ /^(==== Secondary Structure Definition.+)\s+\.$/) {
            $titleline = $1;
        }
        elsif ($line =~ /^REFERENCE\s+(.+);*\s+\.$/) {
            $reference = $1;
            $reference =~ s/(;*\s*$)//;    # drop trailing semicolons/whitespace
        }
        elsif ($line =~ /^HEADER\s+(.+)\s+(\d\d-\w\w\w-\d\d)\s+(\w{4})\s+\.$/) {
            ($header, $date, $pdb_id) = ($1, $2, $3);
            $header =~ s/(;*\s*$)//;       # drop trailing semicolons/whitespace
        }
        elsif ($line =~ /^COMPND\s+\d?\s+(.+);*\s+\.$/) {
            $compnd = $1;
            $compnd =~ s/(;*\s*$)//;       # drop trailing semicolons/whitespace
        }
        elsif ($line =~ /^SOURCE\s+\d?\s+(.+);*\s+\.$/) {
            $source = $1;
            $source =~ s/(;*\s*$)//;       # drop trailing semicolons/whitespace
        }
        elsif ($line =~ /^AUTHOR\s+(.+);*\s+\.$/) {
            $author = $1;
            $author =~ s/(;*\s*$)//;       # drop trailing semicolons/whitespace
            $mode = $STATE_SKIPPING;       # AUTHOR is the last header field
        }
    }
    elsif ($mode == $STATE_SKIPPING) {
        # The column-header line announces the start of the residue table.
        if ($line =~ /RESIDUE AA/) {
            $mode = $STATE_RESIDUES;
        }
    }
    elsif ($mode == $STATE_RESIDUES) {
        # Fixed-column layout: chain identifier at offset 11, one whitespace
        # character, residue letter (or '!') at offset 13, two ignored
        # characters, structure code (letter or whitespace) at offset 16.
        if ($line =~ /^.{11}(.)\s([A-Z!])..([A-Z\s])/i) {
            my ($chain, $residue, $structure) = ($1, $2, $3);

            if ($residue eq '!') {
                # Chain break: start a new, empty segment.
                $seqnum++;
                $sequences[$seqnum]  = '';
                $secstructs[$seqnum] = '';
                $chains[$seqnum]     = ' ';
            }
            else {
                # Append the residue and its structure code; a blank
                # structure column is written as '-'.
                $sequences[$seqnum]  .= $residue;
                $secstructs[$seqnum] .= $structure eq ' ' ? '-' : $structure;
                $chains[$seqnum]      = $chain;
            }
        }
        else {
            die "Can't parse '$line'";
        }
    }
}

# All header fields are mandatory.
if (not defined $titleline) { die "Could not find title line"; }
if (not defined $pdb_id)    { die "Could not extract PDB_ID entry from HEADER"; }
if (not defined $header)    { die "Could not find HEADER entry"; }
if (not defined $compnd)    { die "Could not find COMPND entry"; }
if (not defined $source)    { die "Could not find SOURCE entry"; }
if (not defined $date)      { die "Could not extract DATE entry from HEADER"; }
if (not defined $author)    { die "Could not find AUTHOR entry"; }
if (not defined $reference) { die "Could not find REFERENCE entry"; }

# The header was complete but the residue table was never reached.
if ($mode != $STATE_RESIDUES) { die "Unknown parse error"; }

# Emit one record per segment; the header fields are repeated in each.
for my $i (0 .. $seqnum) {
    print "$titleline\n";
    print "REFERENCE $reference\n";
    if ($chains[$i] ne ' ') {
        print "PDB_ID ${pdb_id}_$chains[$i]\n";
    }
    else {
        print "PDB_ID $pdb_id\n";
    }
    print "DATE $date\n";
    print "HEADER $header\n";
    print "COMPND $compnd\n";
    print "SOURCE $source\n";
    print "AUTHOR $author\n";
    print "SECSTRUCT $secstructs[$i]\n";
    print "SEQUENCE\n$sequences[$i]\n";
}