source: tags/ms_r18q1/EDIT4/ed4_protein_2nd_structure.hxx

Last change on this file was 16763, checked in by westram, 6 years ago
  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 12.4 KB
Line 
1/*! \file   ed4_protein_2nd_structure.hxx
2 *  \brief  Adds support for protein structure prediction, comparison of two
3 *          protein secondary structures and of amino acid sequences with protein
4 *          secondary structures as well as visualization of the match quality in EDIT4.
5 *  \author Markus Urban
6 *  \date   2008-02-08
7 *
8 *  This file contains functions that predict a protein secondary structure from
9 *  its primary structure (i.e. the amino acid sequence) and for visualizing
10 *  how good a sequence matches a given secondary structure. Two secondary
11 *  structures can be compared, too. The initial values for the match symbols
12 *  and other settings are defined here, as well as functions that create a
13 *  "Protein Match Settings" window allowing the user to change the default
14 *  properties for match computation.
15 *
16 *  \sa The functions for protein structure prediction are based on a statistical
17 *      method known as Chou-Fasman algorithm. For details refer to "Chou, P. and
18 *      Fasman, G. (1978). Prediction of the secondary structure of proteins from
19 *      their amino acid sequence. Advanced Enzymology, 47, 45-148.".
20 *
21 *  \attention The used method for secondary structure prediction is fast which
22 *             was the main reason for choosing it. Performance is important
23 *             for a large number of sequences loaded in the editor. However, it
24 *             is not very accurate and should only be used as rough estimation.
25 *             For our purpose, the algorithm as well as our own adaptations to it are
26 *             used to get an approximate overview if a given amino acid sequence
27 *             does not match a certain secondary structure.
28*/
29
30#ifndef ED4_PROTEIN_2ND_STRUCTURE_HXX
31#define ED4_PROTEIN_2ND_STRUCTURE_HXX
32
33#ifndef AW_WINDOW_HXX
34#include "aw_window.hxx"
35#endif
36
37// #define SHOW_PROGRESS ///< Print information about progress to screen (for finding and extending structures and resolving overlaps).
38
39#define PFOLD_AWAR_ENABLE            "Pfold/enable"       //!< Enable structure match.
40#define PFOLD_AWAR_SELECTED_SAI      "Pfold/selected_SAI" //!< Selected reference protein secondary structure SAI (i.e. the SAI that is used for structure comparison).
41#define PFOLD_AWAR_PAIR_TEMPLATE     "Pfold/pairs/%s"     //!< Awar name template (\c %s is a placeholder, presumably filled from #pfold_match_type_awars) for the structure pairs that define the match quality (see #pfold_pairs) as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT.
42#define PFOLD_AWAR_SYMBOL_TEMPLATE   "Pfold/symbols/%s"   //!< Awar name template (\c %s is a placeholder, presumably filled from #pfold_match_type_awars) for the match quality symbols as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT.
43#define PFOLD_AWAR_SYMBOL_TEMPLATE_2 "Pfold/symbols2"     //!< Awar holding the symbols for the match quality as used for match method #SECSTRUCT_SEQUENCE (see #PFOLD_PAIR_CHARS_2).
44#define PFOLD_AWAR_MATCH_METHOD      "Pfold/match_method" //!< Selected method for computing the match quality (see #PFOLD_MATCH_METHOD).
45#define PFOLD_AWAR_SAI_FILTER        "Pfold/SAI_filter"   //!< Filter SAIs for given criteria (string); used in option menu for SAI selection.
46
47// TODO: move static variables to .cpp file?
48
49/*! \brief Protein secondary structure types.
50 *
51 *  Defines the various types of protein secondary structure. The order
52 *  (or at least the individual values) is important, because the values are
53 *  used to access various arrays (e.g. via #cf_former and #cf_breaker).
54 */
55enum PFOLD_STRUCTURE {
56    ALPHA_HELIX       = 0, //!< Alpha-helix
57    BETA_SHEET        = 1, //!< Beta-sheet
58    BETA_TURN         = 2, //!< Beta-turn (special-cased by #cf_former and #cf_breaker)
59    STRUCTURE_SUMMARY = 3, //!< Structure summary
60//  THREE_TURN        = 4, //!< Three turn (currently disabled)
61//  FOUR_TURN         = 5, //!< Four turn (currently disabled)
62//  FIVE_TURN         = 6  //!< Five turn (currently disabled)
63};
64
65//! Defines a name-value pair (e.g. for awars, menu entries, etc.); used as element type of #pfold_match_type_awars.
66struct name_value_pair {
67    const char *name; //!< Name or description (e.g. an awar name)
68    int        value; //!< Value attached to \a name (e.g. a #PFOLD_MATCH_TYPE)
69};
70
71//! Match quality for secondary structure match; enumerators are numbered consecutively starting at 0.
72enum PFOLD_MATCH_TYPE {
73    STRUCT_PERFECT_MATCH,  //!< Perfect match
74    STRUCT_GOOD_MATCH,     //!< Good match
75    STRUCT_MEDIUM_MATCH,   //!< Medium match
76    STRUCT_BAD_MATCH,      //!< Bad match
77    STRUCT_NO_MATCH,       //!< No match
78    STRUCT_UNKNOWN,        //!< Unknown structure
79    PFOLD_MATCH_TYPE_COUNT //!< Number of match types (must stay the last enumerator)
80};
81
82//! Awars for the match type; binds the #PFOLD_MATCH_TYPE to the corresponding awar name.
83extern name_value_pair pfold_match_type_awars[];
84
85#define PFOLD_PAIRS 6 //!< Number of entries in #pfold_pairs and #pfold_pair_chars.
86
87//! Match pair definition (see #PFOLD_MATCH_TYPE) as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT in ED4_pfold_calculate_secstruct_match().
88extern char *pfold_pairs[PFOLD_PAIRS];
89
90//! Symbols for the match quality (defined by #PFOLD_MATCH_TYPE) as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT in ED4_pfold_calculate_secstruct_match().
91extern char *pfold_pair_chars[PFOLD_PAIRS];
92
93/*! \brief Initial symbols for the match quality as used for match method
94 *         #SECSTRUCT_SEQUENCE in ED4_pfold_calculate_secstruct_match().
95 *
96 *  The ten symbols (one character each) represent the match quality ranging
97 *  from 0 - 100% in steps of 10%.
98 */
99
100#define PFOLD_PAIR_CHARS_2 "##++~~--  "
101
102//! Defines the methods for match computation. For details refer to ED4_pfold_calculate_secstruct_match().
103enum PFOLD_MATCH_METHOD {
104    SECSTRUCT_SECSTRUCT,        //!< Compare two protein secondary structures
105    SECSTRUCT_SEQUENCE,         //!< Compare an amino acid sequence with a reference protein secondary structure (default method of ED4_pfold_calculate_secstruct_match())
106    SECSTRUCT_SEQUENCE_PREDICT, //!< Compare a full prediction of the protein secondary structure from its amino acid sequence with a reference protein secondary structure
107    PFOLD_MATCH_METHOD_COUNT    //!< Number of match methods (must stay the last enumerator)
108};
109
/*! \brief Returns the former value of an amino acid depending on the given structure type.
 *
 *  The definition is used for method #SECSTRUCT_SEQUENCE in
 *  ED4_pfold_calculate_secstruct_match() to get the former value of an amino acid
 *  depending on the found structure type at its position. It addresses #cf_parameters
 *  for #ALPHA_HELIX and #BETA_SHEET and #cf_parameters_norm for #BETA_TURN.
 *
 *  \param aa    First index (amino acid) into #cf_parameters / #cf_parameters_norm
 *  \param strct Structure type, used as second index (see #PFOLD_STRUCTURE)
 *
 *  \note Both arguments are parenthesized in the expansion, so arbitrary
 *        expressions (e.g. conditional or bitwise expressions) may be passed.
 *        \a strct is evaluated twice; do not pass expressions with side effects.
 */
#define cf_former(aa, strct) (((strct) != BETA_TURN) ? cf_parameters[(aa)][(strct)] : cf_parameters_norm[(aa)][(strct)])
118
/*! \brief Returns the breaker value of an amino acid depending on the given structure type.
 *
 *  The definition is used for method #SECSTRUCT_SEQUENCE in
 *  ED4_pfold_calculate_secstruct_match() to get the breaker value of an amino acid
 *  depending on the found structure type at its position. It addresses #cf_parameters
 *  for #ALPHA_HELIX and #BETA_SHEET and returns 0 for #BETA_TURN, because it has no
 *  breaker values.
 *
 *  \param aa    First index (amino acid) into #cf_parameters
 *  \param strct Structure type (see #PFOLD_STRUCTURE); the breaker value is read
 *               from column \a strct + 2 of #cf_parameters
 *
 *  \note Both arguments are parenthesized in the expansion, so arbitrary
 *        expressions (e.g. conditional or bitwise expressions) may be passed.
 *        \a strct is evaluated twice; do not pass expressions with side effects.
 */
#define cf_breaker(aa, strct) (((strct) != BETA_TURN) ? cf_parameters[(aa)][(strct) + 2] : 0)
128
129
130// General ED4 functions //////////////////////////////////////////////////////////////////////////////////////////////////////////
131
132
133
134/*! \brief Compares a protein secondary structure with a primary structure or another
135 *         secondary structure.
136 *
137 *  \param[in]  structure_sai Reference protein structure SAI (secondary structure)
138 *  \param[in]  structure_cmp Protein structure to compare (primary or secondary structure)
139 *  \param[in]  start         The start of the match computation (visible area in editor)
140 *  \param[in]  end           The end of the match computation (visible area in editor)
141 *  \param[out] result_buffer Result buffer for match symbols
142 *  \param[in]  match_method  Method for structure match computation (defaults to #SECSTRUCT_SEQUENCE)
143 *  \return     Error description, if an error occurred; 0 otherwise
144 *
145 *  This function compares a protein secondary structure with a primary structure
146 *  (= amino acid sequence) or another secondary structure depending on \a match_method.
147 *
148 *  \par Match method SECSTRUCT_SECSTRUCT:
149 *       Two secondary structures are compared one by one using the criteria defined
150 *       by #pfold_pairs. The match symbols are taken from #pfold_pair_chars.
151 *
152 *  \par Match method SECSTRUCT_SEQUENCE:
153 *       An amino acid sequence is compared with a secondary structure by taking
154 *       cohesive parts of the structure - gaps in the alignment are skipped - and
155 *       computing the normalized difference of former and breaker values (see
156 *       #cf_former and #cf_breaker) for this region in the given sequence such that a
157 *       value from 0 - 100% for the match quality is generated. By dividing this value
158 *       into steps of 10% it is mapped to the match symbols defined by #PFOLD_PAIR_CHARS_2. Note
159 *       that bends ('S') are assumed to fit everywhere (=> best match symbol), and
160 *       if a structure is encountered but no corresponding amino acid the worst match
161 *       symbol is chosen.
162 *
163 *  \par Match method SECSTRUCT_SEQUENCE_PREDICT:
164 *       An amino acid sequence is compared with a secondary structure using a full
165 *       prediction of the secondary structure from its sequence via
166 *       ED4_pfold_predict_structure() and comparing it one by one with the reference
167 *       structure. Note that not the structure summary is used for comparison, but
168 *       the individual predicted structure types as returned in \a structures[4].
169 *       The match criteria are defined in #pfold_pairs which is searched in ascending
170 *       order, i.e. good matches first, then the worse ones. If a match is found
171 *       the corresponding match symbol (as defined by #pfold_pair_chars) is chosen.
172 *       Note that if a structure is encountered but no corresponding amino acid the
173 *       worst match symbol is chosen.
174 *
175 *  The match criteria (for #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT) as
176 *  well as the match symbols (for all methods) can be adjusted by the user in the
177 *  "Protein Match Settings" dialog. The result of the match computation (i.e. the
178 *  match symbols) is written to the result buffer.
179 */
180GB_ERROR ED4_pfold_calculate_secstruct_match(const unsigned char *structure_sai, const unsigned char *structure_cmp, int start, int end, char *result_buffer, PFOLD_MATCH_METHOD match_method = SECSTRUCT_SEQUENCE);
181
182/*! \brief Sets the reference protein secondary structure SAI.
183 *
184 *  \param[out] protstruct     Pointer to reference protein secondary structure SAI
185 *  \param[in]  gb_main        Main database
186 *  \param[in]  alignment_name Name of the alignment to search for
187 *  \param[out] protstruct_len Length of reference protein secondary structure SAI (optional; defaults to NULp)
188 *  \return     Error description, if an error occurred; 0 otherwise
189 *
190 *  The function searches the database \a gb_main for the currently selected SAI
191 *  as defined by #PFOLD_AWAR_SELECTED_SAI and assigns the data of the alignment
192 *  \a alignment_name to \a protstruct. If \a protstruct_len is specified the length
193 *  of the new reference SAI is stored there. The function is used in the editor to
194 *  initialize the reference protein secondary structure SAI and to update it if the
195 *  selected SAI is changed in the "Protein Match Settings" dialog. For this purpose
196 *  it should be called with &ED4_ROOT->protstruct and &ED4_ROOT->protstruct_len.
197 */
198GB_ERROR ED4_pfold_set_SAI(char **protstruct, GBDATA *gb_main, const char *alignment_name, long *protstruct_len = NULp);
199
200
201// AW related functions ///////////////////////////////////////////////////////////////////////////////////////////////
202
203/*! \brief Creates the "Protein Match Settings" window.
204 *
205 *  \param[in] awr             Root window
206 *  \param[in] refreshCallback Callback struct
207 *  \return    Window
208 *
209 *  The "Protein Match Settings" window allows the user to configure the properties
210 *  for protein match computation. These settings include turning the match
211 *  computation on and off (bound to awar #PFOLD_AWAR_ENABLE), selecting the reference
212 *  protein secondary structure SAI (bound to awar #PFOLD_AWAR_SELECTED_SAI), choosing
213 *  the match method (bound to awar #PFOLD_AWAR_MATCH_METHOD, see #PFOLD_MATCH_METHOD)
214 *  and the definition of the match pairs (bound to awar #PFOLD_AWAR_PAIR_TEMPLATE
215 *  and #pfold_match_type_awars, see #PFOLD_MATCH_TYPE and #pfold_pairs) as well as
216 *  the match symbols (bound to awar #PFOLD_AWAR_SYMBOL_TEMPLATE and
217 *  #pfold_match_type_awars or #PFOLD_AWAR_SYMBOL_TEMPLATE_2, see #PFOLD_MATCH_TYPE
218 *  and #pfold_pair_chars or #PFOLD_PAIR_CHARS_2). Via a filter (bound to
219 *  #PFOLD_AWAR_SAI_FILTER) the SAIs shown in the option menu can be narrowed down to
220 *  a selection of SAIs whose names contain the specified string. The callback function
221 *  #ED4_pfold_select_SAI_and_update_option_menu() is bound to the SAI option menu and
222 *  the SAI filter to update the selected SAI in the editor or the selection in the
223 *  SAI option menu.
224 */
225AW_window *ED4_pfold_create_props_window(AW_root *awr, const WindowCallback *refreshCallback);
226
227#endif
Note: See TracBrowser for help on using the repository browser.