#!/usr/bin/perl
#
# Reads a secondary-structure report from STDIN and writes one flat text
# record per chain segment to STDOUT.
#
# Expected input layout (it looks like the output of the DSSP program --
# TODO confirm against the producer of these files):
#   * a title line starting with "==== Secondary Structure Definition";
#   * REFERENCE, HEADER, COMPND, SOURCE and AUTHOR lines, each terminated by
#     whitespace and a final '.' column marker;
#   * a residue table introduced by a line containing "RESIDUE AA", with the
#     chain id in column 12, the residue letter (or '!' for a chain break) in
#     column 14 and the secondary-structure letter in column 17.
#
# Every chain break starts a new output record.  All header fields are
# mandatory; the script dies with a specific message when one is missing.

use warnings;
use strict;

# Parser states, visited strictly in this order.
use constant {
    STATE_HEADER   => 0,    # collecting header fields, until AUTHOR is seen
    STATE_SEEK     => 1,    # skipping lines until the "RESIDUE AA" table header
    STATE_RESIDUES => 2,    # reading residue lines until end of input
};

# Remove trailing semicolons and whitespace from a captured header value.
sub _strip_tail {
    my ($text) = @_;
    $text =~ s/;*\s*$//;
    return $text;
}

# Parse the report available on the filehandle $fh.
#
# Returns a hash reference with the scalar fields titleline, reference,
# header, date, pdb_id, compnd, source and author, plus three parallel array
# references (chains, sequences, secstructs) holding one element per chain
# segment.
#
# Dies when a residue line cannot be parsed, when any header field is missing,
# or when the residue table was never reached.
sub parse_dssp {
    my ($fh) = @_;

    my %entry = (
        titleline  => undef,
        pdb_id     => undef,
        header     => undef,
        compnd     => undef,
        source     => undef,
        date       => undef,
        author     => undef,
        reference  => undef,
        chains     => [' '],
        sequences  => [''],
        secstructs => [''],
    );
    my $state  = STATE_HEADER;
    my $seqnum = 0;             # index of the chain segment being filled

    # Stream the input line by line: this keeps memory use flat and lets die()
    # report the input line that actually failed.
    while ( my $line = <$fh> ) {

        # Strip LF as well as CR so files with CRLF line endings still match
        # the '.'-terminated header patterns below.
        $line =~ s/[\r\n]+\z//;

        if ( $state == STATE_HEADER ) {
            if ( $line =~ /^(==== Secondary Structure Definition.+)\s+\.$/ ) {
                $entry{titleline} = $1;
            }
            elsif ( $line =~ /^REFERENCE\s+(.+);*\s+\.$/ ) {
                $entry{reference} = _strip_tail($1);
            }
            elsif ( $line =~ /^HEADER\s+(.+)\s+(\d\d-\w\w\w-\d\d)\s+(\w{4})\s+\.$/ ) {
                my ( $text, $date, $id ) = ( $1, $2, $3 );
                $entry{header} = _strip_tail($text);
                $entry{date}   = $date;
                $entry{pdb_id} = $id;
            }
            elsif ( $line =~ /^COMPND\s+\d?\s+(.+);*\s+\.$/ ) {
                $entry{compnd} = _strip_tail($1);
            }
            elsif ( $line =~ /^SOURCE\s+\d?\s+(.+);*\s+\.$/ ) {
                $entry{source} = _strip_tail($1);
            }
            elsif ( $line =~ /^AUTHOR\s+(.+);*\s+\.$/ ) {
                $entry{author} = _strip_tail($1);

                # AUTHOR is treated as the last header field of interest.
                $state = STATE_SEEK;
            }
        }
        elsif ( $state == STATE_SEEK ) {
            $state = STATE_RESIDUES if $line =~ /RESIDUE AA/;
        }
        else {
            # A blank line carries no residue data; skip it instead of
            # aborting the whole run (e.g. on a trailing empty line).
            next if $line !~ /\S/;

            if ( $line =~ /^.{11}(.)\s([A-Z!])..([A-Z\s])/i ) {
                my ( $chain, $residue, $structure ) = ( $1, $2, $3 );

                if ( $residue eq '!' ) {
                    # Chain break: start a new, empty segment.
                    $seqnum++;
                    $entry{sequences}[$seqnum]  = '';
                    $entry{secstructs}[$seqnum] = '';
                    $entry{chains}[$seqnum]     = ' ';
                }
                else {
                    # NOTE(review): the match is case-insensitive, so
                    # lower-case residue letters are copied through unchanged.
                    # DSSP reportedly uses lower case for SS-bridged
                    # cysteines -- confirm whether they should become 'C'.
                    $entry{sequences}[$seqnum] .= $residue;

                    # A blank structure column is written as '-'.
                    $entry{secstructs}[$seqnum]
                        .= $structure eq ' ' ? '-' : $structure;

                    # The segment is labelled with the chain id of its most
                    # recently read residue.
                    $entry{chains}[$seqnum] = $chain;
                }
            }
            else {
                die "Can't parse '$line'";
            }
        }
    }

    # Mandatory fields, checked in a fixed order so that the first missing one
    # determines the error message.
    my @required = (
        [ titleline => 'Could not find title line' ],
        [ pdb_id    => 'Could not extract PDB_ID entry from HEADER' ],
        [ header    => 'Could not find HEADER entry' ],
        [ compnd    => 'Could not find COMPND entry' ],
        [ source    => 'Could not find SOURCE entry' ],
        [ date      => 'Could not extract DATE entry from HEADER' ],
        [ author    => 'Could not find AUTHOR entry' ],
        [ reference => 'Could not find REFERENCE entry' ],
    );
    for my $check (@required) {
        my ( $field, $message ) = @{$check};
        die $message if not defined $entry{$field};
    }

    # All header fields were found but the residue table never started.
    die 'Unknown parse error' if $state != STATE_RESIDUES;

    return \%entry;
}

# Render the parsed entry as text: one record per chain segment, each
# repeating the shared header fields.  Returns the complete output string.
sub format_records {
    my ($entry) = @_;

    my $output = '';
    for my $i ( 0 .. $#{ $entry->{sequences} } ) {
        my $chain = $entry->{chains}[$i];

        # Append the chain id to the PDB id only when the segment has one.
        my $id = $entry->{pdb_id};
        $id .= '_' . $chain if $chain ne ' ';

        $output .= "$entry->{titleline}\n";
        $output .= "REFERENCE $entry->{reference}\n";
        $output .= "PDB_ID $id\n";
        $output .= "DATE $entry->{date}\n";
        $output .= "HEADER $entry->{header}\n";
        $output .= "COMPND $entry->{compnd}\n";
        $output .= "SOURCE $entry->{source}\n";
        $output .= "AUTHOR $entry->{author}\n";
        $output .= "SECSTRUCT $entry->{secstructs}[$i]\n";
        $output .= "SEQUENCE\n$entry->{sequences}[$i]\n";
    }
    return $output;
}

# Script entry point: convert STDIN to STDOUT.
sub main {
    print format_records( parse_dssp( \*STDIN ) );
    return;
}

# Run only when executed as a script, so the subs can be loaded for testing.
main() unless caller;

1;