| 1 | /*! \file ed4_protein_2nd_structure.hxx |
|---|
| 2 | * \brief Adds support for protein structure prediction, comparison of two |
|---|
| 3 | * protein secondary structures and of amino acid sequences with protein |
|---|
| 4 | * secondary structures as well as visualization of the match quality in EDIT4. |
|---|
| 5 | * \author Markus Urban |
|---|
| 6 | * \date 2008-02-08 |
|---|
| 7 | * |
|---|
| 8 | * This file contains functions that predict a protein secondary structure from |
|---|
| 9 | * its primary structure (i.e. the amino acid sequence) and for visualizing |
|---|
| 10 | * how well a sequence matches a given secondary structure. Two secondary |
|---|
| 11 | * structures can be compared, too. The initial values for the match symbols |
|---|
| 12 | * and other settings are defined here, as well as functions that create a |
|---|
| 13 | * "Protein Match Settings" window allowing the user to change the default |
|---|
| 14 | * properties for match computation. |
|---|
| 15 | * |
|---|
| 16 | * \sa The functions for protein structure prediction are based on a statistical |
|---|
| 17 | * method known as Chou-Fasman algorithm. For details refer to "Chou, P. and |
|---|
| 18 | * Fasman, G. (1978). Prediction of the secondary structure of proteins from |
|---|
| 19 | * their amino acid sequence. Advanced Enzymology, 47, 45-148.". |
|---|
| 20 | * |
|---|
| 21 | * \attention The method used for secondary structure prediction is fast, which |
|---|
| 22 | * was the main reason for choosing it. Performance is important |
|---|
| 23 | * for a large number of sequences loaded in the editor. However, it |
|---|
| 24 | * is not very accurate and should only be used as a rough estimate. |
|---|
| 25 | * For our purpose, the algorithm as well as our own adaptations of it are |
|---|
| 26 | * used to get an approximate overview if a given amino acid sequence |
|---|
| 27 | * does not match a certain secondary structure. |
|---|
| 28 | */ |
|---|
| 29 | |
|---|
| 30 | #ifndef ED4_PROTEIN_2ND_STRUCTURE_HXX |
|---|
| 31 | #define ED4_PROTEIN_2ND_STRUCTURE_HXX |
|---|
| 32 | |
|---|
| 33 | #ifndef AW_WINDOW_HXX |
|---|
| 34 | #include "aw_window.hxx" |
|---|
| 35 | #endif |
|---|
| 36 | |
|---|
| 37 | // #define SHOW_PROGRESS ///< Print information about progress to screen (for finding and extending structures and resolving overlaps). |
|---|
| 38 | |
|---|
//! Enable structure match.
#define PFOLD_AWAR_ENABLE            "Pfold/enable"

//! Selected reference protein secondary structure SAI (i.e. the SAI that is used for structure comparison).
#define PFOLD_AWAR_SELECTED_SAI      "Pfold/selected_SAI"

//! Structure pairs that define the match quality (see #pfold_pairs) as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT.
#define PFOLD_AWAR_PAIR_TEMPLATE     "Pfold/pairs/%s"

//! Symbols for the match quality as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT.
#define PFOLD_AWAR_SYMBOL_TEMPLATE   "Pfold/symbols/%s"

//! Symbols for the match quality as used for match method #SECSTRUCT_SEQUENCE.
#define PFOLD_AWAR_SYMBOL_TEMPLATE_2 "Pfold/symbols2"

//! Selected method for computing the match quality (see #PFOLD_MATCH_METHOD).
#define PFOLD_AWAR_MATCH_METHOD      "Pfold/match_method"

//! Filter SAIs for given criteria (string); used in option menu for SAI selection.
#define PFOLD_AWAR_SAI_FILTER        "Pfold/SAI_filter"
|---|
| 46 | |
|---|
| 47 | // TODO: move static variables to .cpp file? |
|---|
| 48 | |
|---|
/*! \brief Protein secondary structure types.
 *
 *  Enumerates the supported kinds of protein secondary structure. The
 *  numeric value of each enumerator is important, because the values are
 *  used to access various arrays; do not renumber them.
 *
 *  Currently disabled (kept for reference): THREE_TURN = 4, FOUR_TURN = 5,
 *  FIVE_TURN = 6.
 */
enum PFOLD_STRUCTURE {
    //! Alpha-helix
    ALPHA_HELIX       = 0,
    //! Beta-sheet
    BETA_SHEET        = 1,
    //! Beta-turn
    BETA_TURN         = 2,
    //! Structure summary
    STRUCTURE_SUMMARY = 3
};
|---|
| 64 | |
|---|
//! Associates a name with an integer value (e.g. for awars, menu entries, etc.).
struct name_value_pair {
    //! Name or description
    const char *name;
    //! Value attached to \a name
    int         value;
};
|---|
| 70 | |
|---|
//! Match quality for secondary structure match.
enum PFOLD_MATCH_TYPE {
    //! Perfect match
    STRUCT_PERFECT_MATCH,
    //! Good match
    STRUCT_GOOD_MATCH,
    //! Medium match
    STRUCT_MEDIUM_MATCH,
    //! Bad match
    STRUCT_BAD_MATCH,
    //! No match
    STRUCT_NO_MATCH,
    //! Unknown structure
    STRUCT_UNKNOWN,
    //! Number of match types (must stay the last enumerator)
    PFOLD_MATCH_TYPE_COUNT
};
|---|
| 81 | |
|---|
| 82 | //! Awars for the match type; binds the #PFOLD_MATCH_TYPE to the corresponding awar name. |
|---|
| 83 | extern name_value_pair pfold_match_type_awars[]; |
|---|
| 84 | |
|---|
//! Number of entries in #pfold_pairs and #pfold_pair_chars.
// NOTE(review): looks like this is meant to equal PFOLD_MATCH_TYPE_COUNT - confirm.
#define PFOLD_PAIRS 6

/*! \brief Match pair definition (see #PFOLD_MATCH_TYPE).
 *
 *  Used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT
 *  in ED4_pfold_calculate_secstruct_match().
 */
extern char *pfold_pairs[PFOLD_PAIRS];

/*! \brief Symbols for the match quality (defined by #PFOLD_MATCH_TYPE).
 *
 *  Used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT
 *  in ED4_pfold_calculate_secstruct_match().
 */
extern char *pfold_pair_chars[PFOLD_PAIRS];
|---|
| 92 | |
|---|
/*! \brief Symbols for the match quality as used for match method
 *         #SECSTRUCT_SEQUENCE in ED4_pfold_calculate_secstruct_match().
 *
 *  The ten symbols represent the match quality ranging from 0 - 100% in
 *  steps of 10%. The literal therefore has to contain exactly ten
 *  characters; the symbols come in pairs and the last pair consists of
 *  two blanks.
 */
#define PFOLD_PAIR_CHARS_2 "##++~~--  "
|---|
| 101 | |
|---|
//! Defines the methods for match computation. For details refer to ED4_pfold_calculate_secstruct_match().
enum PFOLD_MATCH_METHOD {
    //! Compare two protein secondary structures
    SECSTRUCT_SECSTRUCT,
    //! Compare an amino acid sequence with a reference protein secondary structure
    SECSTRUCT_SEQUENCE,
    //! Compare a full prediction of the protein secondary structure from its amino acid sequence with a reference protein secondary structure
    SECSTRUCT_SEQUENCE_PREDICT,
    //! Number of match methods (must stay the last enumerator)
    PFOLD_MATCH_METHOD_COUNT
};
|---|
| 109 | |
|---|
/*! \brief Returns the former value of an amino acid depending on the given structure type.
 *
 *  \param aa    Index of the amino acid (first index into the parameter table)
 *  \param strct Structure type (see #PFOLD_STRUCTURE)
 *
 *  The definition is used for method #SECSTRUCT_SEQUENCE in
 *  ED4_pfold_calculate_secstruct_match() to get the former value of an amino acid
 *  depending on the found structure type at its position. It addresses #cf_parameters
 *  for #ALPHA_HELIX and #BETA_SHEET and #cf_parameters_norm for #BETA_TURN.
 *
 *  Both arguments are parenthesized so that compound expressions (e.g. a
 *  conditional expression) can be passed safely. Note that \a strct is
 *  evaluated twice, so it must not have side effects.
 */
#define cf_former(aa, strct) (((strct) != BETA_TURN) ? cf_parameters[(aa)][(strct)] : cf_parameters_norm[(aa)][(strct)])
|---|
| 118 | |
|---|
/*! \brief Returns the breaker value of an amino acid depending on the given structure type.
 *
 *  \param aa    Index of the amino acid (first index into the parameter table)
 *  \param strct Structure type (see #PFOLD_STRUCTURE)
 *
 *  The definition is used for method #SECSTRUCT_SEQUENCE in
 *  ED4_pfold_calculate_secstruct_match() to get the breaker value of an amino acid
 *  depending on the found structure type at its position. It addresses #cf_parameters
 *  for #ALPHA_HELIX and #BETA_SHEET (the breaker values are stored two columns
 *  after the corresponding former values) and returns 0 for #BETA_TURN, because it
 *  has no breaker values.
 *
 *  Both arguments are parenthesized so that compound expressions (e.g. a
 *  conditional expression) can be passed safely. Note that \a strct is
 *  evaluated twice, so it must not have side effects.
 */
#define cf_breaker(aa, strct) (((strct) != BETA_TURN) ? cf_parameters[(aa)][(strct) + 2] : 0)
|---|
| 128 | |
|---|
| 129 | |
|---|
| 130 | // General ED4 functions ////////////////////////////////////////////////////////////////////////////////////////////////////////// |
|---|
| 131 | |
|---|
| 132 | |
|---|
| 133 | |
|---|
| 134 | /*! \brief Compares a protein secondary structure with a primary structure or another |
|---|
| 135 | * secondary structure. |
|---|
| 136 | * |
|---|
| 137 | * \param[in] structure_sai Reference protein structure SAI (secondary structure) |
|---|
| 138 | * \param[in] structure_cmp Protein structure to compare (primary or secondary structure) |
|---|
| 139 | * \param[in] start The start of the match computation (visible area in editor) |
|---|
| 140 | * \param[in] end The end of the match computation (visible area in editor) |
|---|
| 141 | * \param[out] result_buffer Result buffer for match symbols |
|---|
| 142 | * \param[in] match_method Method for structure match computation |
|---|
| 143 | * \return Error description, if an error occurred; 0 otherwise |
|---|
| 144 | * |
|---|
| 145 | * This function compares a protein secondary structure with a primary structure |
|---|
| 146 | * (= amino acid sequence) or another secondary structure depending on \a match_method. |
|---|
| 147 | * |
|---|
| 148 | * \par Match method SECSTRUCT_SECSTRUCT: |
|---|
| 149 | * Two secondary structures are compared one by one using the criteria defined |
|---|
| 150 | * by #pfold_pairs. The match symbols are taken from #pfold_pair_chars. |
|---|
| 151 | * |
|---|
| 152 | * \par Match method SECSTRUCT_SEQUENCE: |
|---|
| 153 | * An amino acid sequence is compared with a secondary structure by taking |
|---|
| 154 | * cohesive parts of the structure - gaps in the alignment are skipped - and |
|---|
| 155 | * computing the normalized difference of former and breaker values for this |
|---|
| 156 | * region in the given sequence such that a value from 0 - 100% for the |
|---|
| 157 | * match quality is generated. By dividing this value into steps of 10% |
|---|
| 158 | * it is mapped to the match symbols defined by #PFOLD_PAIR_CHARS_2. Note |
|---|
| 159 | * that bends ('S') are assumed to fit everywhere (=> best match symbol), and |
|---|
| 160 | * if a structure is encountered but no corresponding amino acid the worst match |
|---|
| 161 | * symbol is chosen. |
|---|
| 162 | * |
|---|
| 163 | * \par Match method SECSTRUCT_SEQUENCE_PREDICT: |
|---|
| 164 | * An amino acid sequence is compared with a secondary structure using a full |
|---|
| 165 | * prediction of the secondary structure from its sequence via |
|---|
| 166 | * ED4_pfold_predict_structure() and comparing it one by one with the reference |
|---|
| 167 | * structure. Note that not the structure summary is used for comparison, but |
|---|
| 168 | * the individual predicted structure types as returned in \a structures[4]. |
|---|
| 169 | * The match criteria are defined in #pfold_pairs which is searched in ascending |
|---|
| 170 | * order, i.e. good matches first, then the worse ones. If a match is found |
|---|
| 171 | * the corresponding match symbol (as defined by #pfold_pair_chars) is chosen. |
|---|
| 172 | * Note that if a structure is encountered but no corresponding amino acid the |
|---|
| 173 | * worst match symbol is chosen. |
|---|
| 174 | * |
|---|
| 175 | * The match criteria (for #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT) as |
|---|
| 176 | * well as the match symbols (for all methods) can be adjusted by the user in the |
|---|
| 177 | * "Protein Match Settings" dialog. The result of the match computation (i.e. the |
|---|
| 178 | * match symbols) is written to the result buffer. |
|---|
| 179 | */ |
|---|
| 180 | GB_ERROR ED4_pfold_calculate_secstruct_match(const unsigned char *structure_sai, const unsigned char *structure_cmp, int start, int end, char *result_buffer, PFOLD_MATCH_METHOD match_method = SECSTRUCT_SEQUENCE); |
|---|
| 181 | |
|---|
| 182 | /*! \brief Sets the reference protein secondary structure SAI. |
|---|
| 183 | * |
|---|
| 184 | * \param[out] protstruct Pointer to reference protein secondary structure SAI |
|---|
| 185 | * \param[in] gb_main Main database |
|---|
| 186 | * \param[in] alignment_name Name of the alignment to search for |
|---|
| 187 | * \param[out] protstruct_len Length of reference protein secondary structure SAI |
|---|
| 188 | * \return Error description, if an error occurred; 0 otherwise |
|---|
| 189 | * |
|---|
| 190 | * The function searches the database \a gb_main for the currently selected SAI |
|---|
| 191 | * as defined by #PFOLD_AWAR_SELECTED_SAI and assigns the data of the alignment |
|---|
| 192 | * \a alignment_name to \a protstruct. If \a protstruct_len is specified the length |
|---|
| 193 | * of the new reference SAI is stored. The function is used in the editor to |
|---|
| 194 | * initialize the reference protein secondary structure SAI and to update it if the |
|---|
| 195 | * selected SAI is changed in the "Protein Match Settings" dialog. For this purpose |
|---|
| 196 | * it should be called with &ED4_ROOT->protstruct and &ED4_ROOT->protstruct_len. |
|---|
| 197 | */ |
|---|
| 198 | GB_ERROR ED4_pfold_set_SAI(char **protstruct, GBDATA *gb_main, const char *alignment_name, long *protstruct_len = NULp); |
|---|
| 199 | |
|---|
| 200 | |
|---|
| 201 | // AW related functions /////////////////////////////////////////////////////////////////////////////////////////////// |
|---|
| 202 | |
|---|
| 203 | /*! \brief Creates the "Protein Match Settings" window. |
|---|
| 204 | * |
|---|
| 205 | * \param[in] awr Root window |
|---|
| 206 | * \param[in] refreshCallback Callback struct |
|---|
| 207 | * \return Window |
|---|
| 208 | * |
|---|
| 209 | * The "Protein Match Settings" window allows the user to configure the properties |
|---|
| 210 | * for protein match computation. These settings include turning the match |
|---|
| 211 | * computation on and off (bound to awar #PFOLD_AWAR_ENABLE), selecting the reference |
|---|
| 212 | * protein secondary structure SAI (bound to awar #PFOLD_AWAR_SELECTED_SAI), choosing |
|---|
| 213 | * the match method (bound to awar #PFOLD_AWAR_MATCH_METHOD, see #PFOLD_MATCH_METHOD) |
|---|
| 214 | * and the definition of the match pairs (bound to awar #PFOLD_AWAR_PAIR_TEMPLATE |
|---|
| 215 | * and #pfold_match_type_awars, see #PFOLD_MATCH_TYPE and #pfold_pairs) as well as |
|---|
| 216 | * the match symbols (bound to awar #PFOLD_AWAR_SYMBOL_TEMPLATE and |
|---|
| 217 | * #pfold_match_type_awars or #PFOLD_AWAR_SYMBOL_TEMPLATE_2, see #PFOLD_MATCH_TYPE |
|---|
| 218 | * and #pfold_pair_chars or #PFOLD_PAIR_CHARS_2). Via a filter (bound to |
|---|
| 219 | * #PFOLD_AWAR_SAI_FILTER) the SAIs shown in the option menu can be narrowed down to |
|---|
| 220 | * a selection of SAIs whose names contain the specified string. The callback function |
|---|
| 221 | * #ED4_pfold_select_SAI_and_update_option_menu() is bound to the SAI option menu and |
|---|
| 222 | * the SAI filter to update the selected SAI in the editor or the selection in the |
|---|
| 223 | * SAI option menu. |
|---|
| 224 | */ |
|---|
| 225 | AW_window *ED4_pfold_create_props_window(AW_root *awr, const WindowCallback *refreshCallback); |
|---|
| 226 | |
|---|
| 227 | #endif |
|---|