1 | /*! \file ed4_protein_2nd_structure.hxx |
---|
2 | * \brief Adds support for protein structure prediction, comparison of two |
---|
3 | * protein secondary structures and of amino acid sequences with protein |
---|
4 | * secondary structures as well as visualization of the match quality in EDIT4. |
---|
5 | * \author Markus Urban |
---|
6 | * \date 2008-02-08 |
---|
7 | * |
---|
8 | * This file contains functions that predict a protein secondary structure from |
---|
9 | * its primary structure (i.e. the amino acid sequence) and for visualizing |
---|
10 | * how well a sequence matches a given secondary structure. Two secondary |
---|
11 | * structures can be compared, too. The initial values for the match symbols |
---|
12 | * and other settings are defined here, as well as functions that create a |
---|
13 | * "Protein Match Settings" window allowing the user to change the default |
---|
14 | * properties for match computation. |
---|
15 | * |
---|
16 | * \sa The functions for protein structure prediction are based on a statistical |
---|
17 | * method known as Chou-Fasman algorithm. For details refer to "Chou, P. and |
---|
18 | * Fasman, G. (1978). Prediction of the secondary structure of proteins from |
---|
19 | * their amino acid sequence. Advanced Enzymology, 47, 45-148.". |
---|
20 | * |
---|
21 | * \attention The used method for secondary structure prediction is fast which |
---|
22 | * was the main reason for choosing it. Performance is important |
---|
23 | * for a large number of sequences loaded in the editor. However, it |
---|
24 | * is not very accurate and should only be used as rough estimation. |
---|
25 | * For our purpose, the algorithm as well as our own adaptations to it are |
---|
26 | * used to get an approximate overview if a given amino acid sequence |
---|
27 | * does not match a certain secondary structure. |
---|
28 | */ |
---|
29 | |
---|
30 | #ifndef ED4_PROTEIN_2ND_STRUCTURE_HXX |
---|
31 | #define ED4_PROTEIN_2ND_STRUCTURE_HXX |
---|
32 | |
---|
33 | #ifndef AW_WINDOW_HXX |
---|
34 | #include "aw_window.hxx" |
---|
35 | #endif |
---|
36 | |
---|
37 | // #define SHOW_PROGRESS ///< Print information about progress to screen (for finding and extending structures and resolving overlaps). |
---|
38 | |
---|
//! Enable structure match.
#define PFOLD_AWAR_ENABLE            "Pfold/enable"

//! Selected reference protein secondary structure SAI (i.e. the SAI that is used for structure comparison).
#define PFOLD_AWAR_SELECTED_SAI      "Pfold/selected_SAI"

//! Structure pairs that define the match quality (see #pfold_pairs) as used for
//! match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT.
#define PFOLD_AWAR_PAIR_TEMPLATE     "Pfold/pairs/%s"

//! Symbols for the match quality as used for match methods #SECSTRUCT_SECSTRUCT
//! and #SECSTRUCT_SEQUENCE_PREDICT.
#define PFOLD_AWAR_SYMBOL_TEMPLATE   "Pfold/symbols/%s"

//! Symbols for the match quality as used for match method #SECSTRUCT_SEQUENCE.
#define PFOLD_AWAR_SYMBOL_TEMPLATE_2 "Pfold/symbols2"

//! Selected method for computing the match quality (see #PFOLD_MATCH_METHOD).
#define PFOLD_AWAR_MATCH_METHOD      "Pfold/match_method"

//! Filter SAIs for given criteria (string); used in option menu for SAI selection.
#define PFOLD_AWAR_SAI_FILTER        "Pfold/SAI_filter"
46 | |
---|
47 | // TODO: move static variables to .cpp file? |
---|
48 | |
---|
/*! \brief Protein secondary structure types.
 *
 *  Enumerates the kinds of protein secondary structure. The numeric values
 *  are spelled out explicitly because they are used to access various
 *  arrays; do not renumber them.
 *
 *  The enumerators THREE_TURN (4), FOUR_TURN (5) and FIVE_TURN (6) are
 *  currently disabled.
 */
enum PFOLD_STRUCTURE {
    //! Alpha-helix
    ALPHA_HELIX       = 0,
    //! Beta-sheet
    BETA_SHEET        = 1,
    //! Beta-turn
    BETA_TURN         = 2,
    //! Structure summary
    STRUCTURE_SUMMARY = 3
};
64 | |
---|
/*! \brief Associates a name with an integer value.
 *
 *  Used wherever a textual name has to be bound to a value
 *  (e.g. for awars, menu entries, etc.).
 */
struct name_value_pair {
    //! Name or description
    const char *name;
    //! Value attached to \a name
    int         value;
};
70 | |
---|
/*! \brief Match quality for secondary structure match.
 *
 *  The enumerators are numbered consecutively starting at 0, so that
 *  #PFOLD_MATCH_TYPE_COUNT equals the number of real match types.
 */
enum PFOLD_MATCH_TYPE {
    //! Perfect match
    STRUCT_PERFECT_MATCH,
    //! Good match
    STRUCT_GOOD_MATCH,
    //! Medium match
    STRUCT_MEDIUM_MATCH,
    //! Bad match
    STRUCT_BAD_MATCH,
    //! No match
    STRUCT_NO_MATCH,
    //! Unknown structure
    STRUCT_UNKNOWN,
    //! Number of match types (keep last)
    PFOLD_MATCH_TYPE_COUNT
};
81 | |
---|
82 | //! Awars for the match type; binds the #PFOLD_MATCH_TYPE to the corresponding awar name. |
---|
83 | extern name_value_pair pfold_match_type_awars[]; |
---|
84 | |
---|
//! Number of entries in #pfold_pairs and #pfold_pair_chars.
//! NOTE(review): presumably one entry per #PFOLD_MATCH_TYPE (both are 6) - confirm.
#define PFOLD_PAIRS 6

/*! \brief Match pair definition (see #PFOLD_MATCH_TYPE).
 *
 *  Used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT
 *  in ED4_pfold_calculate_secstruct_match().
 */
extern char *pfold_pairs[PFOLD_PAIRS];

/*! \brief Symbols for the match quality (defined by #PFOLD_MATCH_TYPE).
 *
 *  Used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT
 *  in ED4_pfold_calculate_secstruct_match().
 */
extern char *pfold_pair_chars[PFOLD_PAIRS];
92 | |
---|
/*! \brief Symbols for the match quality as used for match method
 *         #SECSTRUCT_SEQUENCE in ED4_pfold_calculate_secstruct_match().
 *
 *  The ten symbols represent the match quality ranging from 0 - 100% in
 *  steps of 10%. The literal therefore has to hold exactly ten characters:
 *  the symbols come in pairs ('#', '+', '~', '-', ' '), and the two trailing
 *  blanks are symbols of their own, not padding. Dropping one of them would
 *  leave only nine symbols for ten steps.
 */
#define PFOLD_PAIR_CHARS_2 "##++~~--  "
101 | |
---|
/*! \brief Methods available for match computation.
 *
 *  For details refer to ED4_pfold_calculate_secstruct_match().
 */
enum PFOLD_MATCH_METHOD {
    //! Compare two protein secondary structures
    SECSTRUCT_SECSTRUCT,
    //! Compare an amino acid sequence with a reference protein secondary structure
    SECSTRUCT_SEQUENCE,
    //! Compare a full prediction of the protein secondary structure from its
    //! amino acid sequence with a reference protein secondary structure
    SECSTRUCT_SEQUENCE_PREDICT,
    //! Number of match methods (keep last)
    PFOLD_MATCH_METHOD_COUNT
};
109 | |
---|
/*! \brief Returns the former value of an amino acid depending on the given structure type.
 *
 *  The definition is used for method #SECSTRUCT_SEQUENCE in
 *  ED4_pfold_calculate_secstruct_match() to get the former value of an amino acid
 *  depending on the found structure type at its position. It addresses #cf_parameters
 *  for #ALPHA_HELIX and #BETA_SHEET and #cf_parameters_norm for #BETA_TURN
 *  (the literal 2 below is the value of #BETA_TURN).
 *
 *  \param aa     Row index into #cf_parameters / #cf_parameters_norm
 *  \param strct  Structure type; 2 selects #cf_parameters_norm, every other
 *                value selects #cf_parameters
 *
 *  Both arguments are fully parenthesized so that compound expressions
 *  (e.g. a conditional expression) can be passed safely. Note that \a aa and
 *  \a strct are still evaluated more than once; do not pass expressions with
 *  side effects.
 */
#define cf_former(aa, strct) (((strct) != 2) ? cf_parameters[(aa)][(strct)] : cf_parameters_norm[(aa)][(strct)])
118 | |
---|
/*! \brief Returns the breaker value of an amino acid depending on the given structure type.
 *
 *  The definition is used for method #SECSTRUCT_SEQUENCE in
 *  ED4_pfold_calculate_secstruct_match() to get the breaker value of an amino acid
 *  depending on the found structure type at its position. It addresses #cf_parameters
 *  for #ALPHA_HELIX and #BETA_SHEET (column \a strct + 2) and returns 0 for
 *  #BETA_TURN (value 2), because it has no breaker values.
 *
 *  \param aa     Row index into #cf_parameters
 *  \param strct  Structure type; 2 yields 0, every other value reads
 *                column \a strct + 2 of #cf_parameters
 *
 *  Both arguments are fully parenthesized so that compound expressions
 *  (e.g. a conditional expression) can be passed safely. Note that \a strct
 *  is still evaluated more than once; do not pass expressions with side effects.
 */
#define cf_breaker(aa, strct) (((strct) != 2) ? cf_parameters[(aa)][(strct) + 2] : 0)
128 | |
---|
129 | |
---|
130 | // General ED4 functions ////////////////////////////////////////////////////////////////////////////////////////////////////////// |
---|
131 | |
---|
132 | |
---|
133 | |
---|
134 | /*! \brief Compares a protein secondary structure with a primary structure or another |
---|
135 | * secondary structure. |
---|
136 | * |
---|
137 | * \param[in] structure_sai Reference protein structure SAI (secondary structure) |
---|
138 | * \param[in] structure_cmp Protein structure to compare (primary or secondary structure) |
---|
139 | * \param[in] start The start of the match computation (visible area in editor) |
---|
140 | * \param[in] end The end of the match computation (visible area in editor) |
---|
141 | * \param[out] result_buffer Result buffer for match symbols |
---|
142 | * \param[in] match_method Method for structure match computation |
---|
143 | * \return Error description, if an error occurred; 0 otherwise |
---|
144 | * |
---|
145 | * This function compares a protein secondary structure with a primary structure |
---|
146 | * (= amino acid sequence) or another secondary structure depending on \a match_method. |
---|
147 | * |
---|
148 | * \par Match method SECSTRUCT_SECSTRUCT: |
---|
149 | * Two secondary structures are compared one by one using the criteria defined |
---|
150 | * by #pfold_pairs. The match symbols are taken from #pfold_pair_chars. |
---|
151 | * |
---|
152 | * \par Match method SECSTRUCT_SEQUENCE: |
---|
153 | * An amino acid sequence is compared with a secondary structure by taking |
---|
154 | * cohesive parts of the structure - gaps in the alignment are skipped - and |
---|
155 | * computing the normalized difference of former and breaker values for this |
---|
156 | * region in the given sequence such that a value from 0 - 100% for the |
---|
157 | * match quality is generated. By dividing this value into steps of 10% |
---|
158 | * it is mapped to the match symbols defined by #PFOLD_PAIR_CHARS_2. Note |
---|
159 | * that bends ('S') are assumed to fit everywhere (=> best match symbol), and |
---|
160 | * if a structure is encountered but no corresponding amino acid the worst match |
---|
161 | * symbol is chosen. |
---|
162 | * |
---|
163 | * \par Match method SECSTRUCT_SEQUENCE_PREDICT: |
---|
164 | * An amino acid sequence is compared with a secondary structure using a full |
---|
165 | * prediction of the secondary structure from its sequence via |
---|
166 | * ED4_pfold_predict_structure() and comparing it one by one with the reference |
---|
167 | * structure. Note that not the structure summary is used for comparison, but |
---|
168 | * the individual predicted structure types as returned in \a structures[4]. |
---|
169 | * The match criteria are defined in #pfold_pairs which is searched in ascending |
---|
170 | * order, i.e. good matches first, then the worse ones. If a match is found |
---|
171 | * the corresponding match symbol (as defined by #pfold_pair_chars) is chosen. |
---|
172 | * Note that if a structure is encountered but no corresponding amino acid the |
---|
173 | * worst match symbol is chosen. |
---|
174 | * |
---|
175 | * The match criteria (for #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT) as |
---|
176 | * well as the match symbols (for all methods) can be adjusted by the user in the |
---|
177 | * "Protein Match Settings" dialog. The result of the match computation (i.e. the |
---|
178 | * match symbols) is written to the result buffer. |
---|
179 | */ |
---|
180 | GB_ERROR ED4_pfold_calculate_secstruct_match(const unsigned char *structure_sai, const unsigned char *structure_cmp, int start, int end, char *result_buffer, PFOLD_MATCH_METHOD match_method = SECSTRUCT_SEQUENCE); |
---|
181 | |
---|
182 | /*! \brief Sets the reference protein secondary structure SAI. |
---|
183 | * |
---|
184 | * \param[out] protstruct Pointer to reference protein secondary structure SAI |
---|
185 | * \param[in] gb_main Main database |
---|
186 | * \param[in] alignment_name Name of the alignment to search for |
---|
187 | * \param[out] protstruct_len Length of reference protein secondary structure SAI |
---|
188 | * \return Error description, if an error occurred; 0 otherwise |
---|
189 | * |
---|
190 | * The function searches the database \a gb_main for the currently selected SAI |
---|
191 | * as defined by #PFOLD_AWAR_SELECTED_SAI and assigns the data of the alignment |
---|
192 | * \a alignment_name to \a protstruct. If \a protstruct_len is specified the length |
---|
193 | * of the new reference SAI is stored. The function is used in the editor to |
---|
194 | * initialize the reference protein secondary structure SAI and to update it if the |
---|
195 | * selected SAI is changed in the "Protein Match Settings" dialog. For this purpose |
---|
196 | * it should be called with &ED4_ROOT->protstruct and &ED4_ROOT->protstruct_len. |
---|
197 | */ |
---|
198 | GB_ERROR ED4_pfold_set_SAI(char **protstruct, GBDATA *gb_main, const char *alignment_name, long *protstruct_len = NULp); |
---|
199 | |
---|
200 | |
---|
201 | // AW related functions /////////////////////////////////////////////////////////////////////////////////////////////// |
---|
202 | |
---|
203 | /*! \brief Creates the "Protein Match Settings" window. |
---|
204 | * |
---|
205 | * \param[in] awr Root window |
---|
206 | * \param[in] cb Callback struct |
---|
207 | * \return Window |
---|
208 | * |
---|
209 | * The "Protein Match Settings" window allows the user to configure the properties |
---|
210 | * for protein match computation. These settings include turning the match |
---|
211 | * computation on and off (bound to awar #PFOLD_AWAR_ENABLE), selecting the reference |
---|
212 | * protein secondary structure SAI (bound to awar #PFOLD_AWAR_SELECTED_SAI), choosing |
---|
213 | * the match method (bound to awar #PFOLD_AWAR_MATCH_METHOD, see #PFOLD_MATCH_METHOD) |
---|
214 | * and the definition of the match pairs (bound to awar #PFOLD_AWAR_PAIR_TEMPLATE |
---|
215 | * and #pfold_match_type_awars, see #PFOLD_MATCH_TYPE and #pfold_pairs) as well as |
---|
216 | * the match symbols (bound to awar #PFOLD_AWAR_SYMBOL_TEMPLATE and |
---|
217 | * #pfold_match_type_awars or #PFOLD_AWAR_SYMBOL_TEMPLATE_2, see #PFOLD_MATCH_TYPE |
---|
218 | * and #pfold_pair_chars or #PFOLD_PAIR_CHARS_2). Via a filter (bound to |
---|
219 | * #PFOLD_AWAR_SAI_FILTER) the SAIs shown in the option menu can be narrowed down to |
---|
220 | * a selection of SAIs whose names contain the specified string. The callback function |
---|
221 | * #ED4_pfold_select_SAI_and_update_option_menu() is bound to the SAI option menu and |
---|
222 | * the SAI filter to update the selected SAI in the editor or the selection in the |
---|
223 | * SAI option menu. |
---|
224 | */ |
---|
225 | AW_window *ED4_pfold_create_props_window(AW_root *awr, const WindowCallback *refreshCallback); |
---|
226 | |
---|
227 | #endif |
---|