source: tags/arb_5.1/EDIT4/ed4_protein_2nd_structure.hxx

Last change on this file was 5441, checked in by westram, 16 years ago
  • removed useless typedefs
  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 12.4 KB
Line 
1/** \file   ed4_protein_2nd_structure.hxx
2 *  \brief  Adds support for protein structure prediction, comparison of two
3 *          protein secondary structures and of amino acid sequences with protein
4 *          secondary structures as well as visualization of the match quality in EDIT4.
5 *  \author Markus Urban
6 *  \date   2008-02-08
7 *
8 *  This file contains functions that predict a protein secondary structure from
9 *  its primary structure (i.e. the amino acid sequence) and for visualizing
10 *  how well a sequence matches a given secondary structure. Two secondary
11 *  structures can be compared, too. The initial values for the match symbols
12 *  and other settings are defined here, as well as functions that create a
13 *  "Protein Match Settings" window allowing the user to change the default
14 *  properties for match computation.
15 *
16 *  \sa The functions for protein structure prediction are based on a statistical
17 *      method known as Chou-Fasman algorithm. For details refer to "Chou, P. and
18 *      Fasman, G. (1978). Prediction of the secondary structure of proteins from
19 *      their amino acid sequence. Advanced Enzymology, 47, 45-148.".
20 *
21 *  \attention The used method for secondary structure prediction is fast which
22 *             was the main reason for choosing it. Performance is important
23 *             for a large number of sequences loaded in the editor. However, it
24 *             is not very accurate and should only be used as rough estimation.
25 *             For our purpose, the algorithm as well as own adaptions to it are
26 *             used to get an approximate overview if a given amino acid sequence
27 *             does not match a certain secondary structure.
28*/
29
30#ifndef ED4_PROTEIN_2ND_STRUCTURE_HXX
31#define ED4_PROTEIN_2ND_STRUCTURE_HXX
32
33#ifndef AW_WINDOW_HXX
34#include "aw_window.hxx"
35#endif
36
37//#define SHOW_PROGRESS ///< Print information about progress to screen (for finding and extending structures and resolving overlaps).
38
/// Enable structure match.
#define PFOLD_AWAR_ENABLE "Pfold/enable"

/// Selected reference protein secondary structure SAI (i.e. the SAI that is used for structure comparison).
#define PFOLD_AWAR_SELECTED_SAI "Pfold/selected_SAI"

/// Structure pairs that define the match quality (see #pfold_pairs) as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT.
#define PFOLD_AWAR_PAIR_TEMPLATE "Pfold/pairs/%s"

/// Symbols for the match quality as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT.
#define PFOLD_AWAR_SYMBOL_TEMPLATE "Pfold/symbols/%s"

/// Symbols for the match quality as used for match method #SECSTRUCT_SEQUENCE.
#define PFOLD_AWAR_SYMBOL_TEMPLATE_2 "Pfold/symbols2"

/// Selected method for computing the match quality (see #PFOLD_MATCH_METHOD).
#define PFOLD_AWAR_MATCH_METHOD "Pfold/match_method"

/// Filter SAIs for given criteria (string); used in option menu for SAI selection.
#define PFOLD_AWAR_SAI_FILTER "Pfold/SAI_filter"
46
47// TODO: move static variables to .cpp file?
48
/** \brief Protein secondary structure types.
 *
 *  Defines the various types of protein secondary structure. The individual
 *  values are significant because they are used to access various arrays, so
 *  existing enumerators must keep their numeric values.
 *
 *  \note Enumerators THREE_TURN (4), FOUR_TURN (5) and FIVE_TURN (6) are
 *        currently disabled.
 */
enum PFOLD_STRUCTURE {
    ALPHA_HELIX       = 0, ///< Alpha-helix
    BETA_SHEET        = 1, ///< Beta-sheet
    BETA_TURN         = 2, ///< Beta-turn
    STRUCTURE_SUMMARY = 3  ///< Structure summary (last enumerator: no trailing comma, which is only valid since C++11)
};
64
/** \brief Associates a name with an integer value (e.g. for awars, menu entries, etc.). */
struct name_value_pair {
    /// Name or description
    const char *name;
    /// Value attached to \a name
    int value;
};
70
71/// Match quality for secondary structure match.
72typedef enum {
73    STRUCT_PERFECT_MATCH,  ///< Perfect match
74    STRUCT_GOOD_MATCH,     ///< Good match
75    STRUCT_MEDIUM_MATCH,   ///< Medium match
76    STRUCT_BAD_MATCH,      ///< Bad match
77    STRUCT_NO_MATCH,       ///< No match
78    STRUCT_UNKNOWN,        ///< Unknown structure
79    PFOLD_MATCH_TYPE_COUNT ///< Number of match types
80} PFOLD_MATCH_TYPE;
81
82/// Awars for the match type; binds the #PFOLD_MATCH_TYPE to the corresponding awar name.
83extern name_value_pair pfold_match_type_awars[];
84
85/// Match pair definition (see #PFOLD_MATCH_TYPE) as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT in ED4_pfold_calculate_secstruct_match().
86extern char *pfold_pairs[6];
87
88/// Symbols for the match quality (defined by #PFOLD_MATCH_TYPE) as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT in ED4_pfold_calculate_secstruct_match().
89extern char *pfold_pair_chars[6];
90
/** \brief Symbols for the match quality as used for match method
 *         #SECSTRUCT_SEQUENCE in ED4_pfold_calculate_secstruct_match().
 *
 *  The string holds ten symbols, one per 10% step of the match quality
 *  range 0 - 100%.
 */
#define PFOLD_PAIR_CHARS_2 "##++~~--  "
100
/** \brief Methods available for match computation.
 *
 *  For details refer to ED4_pfold_calculate_secstruct_match().
 */
typedef enum {
    /// Compare two protein secondary structures
    SECSTRUCT_SECSTRUCT = 0,
    /// Compare an amino acid sequence with a reference protein secondary structure
    SECSTRUCT_SEQUENCE = 1,
    /// Compare a full prediction of the protein secondary structure from its amino acid sequence with a reference protein secondary structure
    SECSTRUCT_SEQUENCE_PREDICT = 2,
    /// Number of match methods
    PFOLD_MATCH_METHOD_COUNT = 3
} PFOLD_MATCH_METHOD;
108
/** \brief Returns the former value of an amino acid depending on the given structure type.
 *
 *  The definition is used for method #SECSTRUCT_SEQUENCE in
 *  ED4_pfold_calculate_secstruct_match() to get the former value of an amino acid
 *  depending on the found structure type at its position. It addresses #cf_parameters
 *  for every structure type other than #BETA_TURN (i.e. #ALPHA_HELIX and #BETA_SHEET)
 *  and #cf_parameters_norm for #BETA_TURN.
 *
 *  \param aa    Row index (amino acid) into the parameter table
 *  \param strct Structure type (see #PFOLD_STRUCTURE); also used as column index
 *
 *  \note Both arguments are fully parenthesized so that arbitrary expressions can be
 *        passed. \a strct is still evaluated twice, so it must not have side effects.
 */
#define cf_former(aa, strct) \
    ( ((strct) != BETA_TURN) ? cf_parameters[(aa)][(strct)] : cf_parameters_norm[(aa)][(strct)] )
117
/** \brief Returns the breaker value of an amino acid depending on the given structure type.
 *
 *  The definition is used for method #SECSTRUCT_SEQUENCE in
 *  ED4_pfold_calculate_secstruct_match() to get the breaker value of an amino acid
 *  depending on the found structure type at its position. It addresses #cf_parameters
 *  for every structure type other than #BETA_TURN (i.e. #ALPHA_HELIX and #BETA_SHEET),
 *  reading the column two positions after the structure type's own column, and returns
 *  0 for #BETA_TURN, because it has no breaker values.
 *
 *  \param aa    Row index (amino acid) into the parameter table
 *  \param strct Structure type (see #PFOLD_STRUCTURE)
 *
 *  \note Both arguments are fully parenthesized so that arbitrary expressions can be
 *        passed. \a strct is still evaluated twice, so it must not have side effects.
 */
#define cf_breaker(aa, strct) \
    ( ((strct) != BETA_TURN) ? cf_parameters[(aa)][(strct) + 2] : 0 )
127
128
129// General ED4 functions //////////////////////////////////////////////////////////////////////////////////////////////////////////
130
131
132
133/** \brief Compares a protein secondary structure with a primary structure or another
134 *         secondary structure.
135 *
136 *  \param[in]  structure_sai Reference protein structure SAI (secondary structure)
137 *  \param[in]  structure_cmp Protein structure to compare (primary or secondary structure)
138 *  \param[in]  start         The start of the match computation (visible area in editor)
139 *  \param[in]  end           The end of the match computation (visible area in editor)
140 *  \param[out] result_buffer Result buffer for match symbols
141 *  \param[in]  match_method  Method for structure match computation
142 *  \return     Error description, if an error occurred; 0 otherwise
143 *
144 *  This function compares a protein secondary structure with a primary structure
145 *  (= amino acid sequence) or another secondary structure depending on \a match_method.
146 *
147 *  \par Match method SECSTRUCT_SECSTRUCT:
148 *       Two secondary structures are compared one by one using the criteria defined
149 *       by #pfold_pairs. The match symbols are taken from #pfold_pair_chars.
150 *
151 *  \par Match method SECSTRUCT_SEQUENCE:
152 *       An amino acid sequence is compared with a secondary structure by taking
153 *       cohesive parts of the structure - gaps in the alignment are skipped - and
154 *       computing the normalized difference of former and breaker values for this
155 *       region in the given sequence such that a value from 0 - 100% for the
156 *       match quality is generated. By dividing this value into steps of 10%
157 *       it is mapped to the match symbols defined by #PFOLD_PAIR_CHARS_2. Note
158 *       that bends ('S') are assumed to fit everywhere (=> best match symbol), and
159 *       if a structure is encountered but no corresponding amino acid the worst match
160 *       symbol is chosen.
161 *
162 *  \par Match method SECSTRUCT_SEQUENCE_PREDICT:
163 *       An amino acid sequence is compared with a secondary structure using a full
164 *       prediction of the secondary structure from its sequence via
165 *       ED4_pfold_predict_structure() and comparing it one by one with the reference
166 *       structure. Note that not the structure summary is used for comparison, but
167 *       the individual predicted structure types as returned in \a structures[4].
168 *       The match criteria are defined in #pfold_pairs which is searched in ascending
169 *       order, i.e. good matches first, then the worse ones. If a match is found
170 *       the corresponding match symbol (as defined by #pfold_pair_chars) is chosen.
171 *       Note that if a structure is encountered but no corresponding amino acid the
172 *       worst match symbol is chosen.
173 *
174 *  The match criteria (for #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT) as
175 *  well as the match symbols (for all methods) can be adjusted by the user in the
176 *  "Protein Match Settings" dialog. The result of the match computation (i.e. the
177 *  match symbols) is written to the result buffer.
178 */
179GB_ERROR ED4_pfold_calculate_secstruct_match(const unsigned char *structure_sai, const unsigned char *structure_cmp, int start, int end, char *result_buffer, PFOLD_MATCH_METHOD match_method = SECSTRUCT_SEQUENCE);
180
181/** \brief Sets the reference protein secondary structure SAI.
182 *
183 *  \param[out] protstruct     Pointer to reference protein secondary structure SAI
184 *  \param[in]  gb_main        Main database
185 *  \param[in]  alignment_name Name of the alignment to search for
186 *  \param[out] protstruct_len Length of reference protein secondary structure SAI
187 *  \return     Error description, if an error occurred; 0 otherwise
188 *
189 *  The function searches the database \a gb_main for the currently selected SAI
190 *  as defined by #PFOLD_AWAR_SELECTED_SAI and assigns the data of the alignment
191 *  \a alignment_name to \a protstruct. If \a protstruct_len is specified the length
192 *  of the new reference SAI is stored. The function is used in the editor to
193 *  initialize the reference protein secondary structure SAI and to update it if the
194 *  selected SAI is changed in the "Protein Match Settings" dialog. For this purpose
195 *  it should be called with &ED4_ROOT->protstruct and &ED4_ROOT->protstruct_len.
196 */
197GB_ERROR ED4_pfold_set_SAI(char **protstruct, GBDATA *gb_main, const char *alignment_name, long *protstruct_len = 0);
198
199
200// AW related functions ///////////////////////////////////////////////////////////////////////////////////////////////
201
202/** \brief Creates the "Protein Match Settings" window.
203 *
204 *  \param[in] awr   Root window
205 *  \param[in] awcbs Callback struct
206 *  \return    Window
207 *
208 *  The "Protein Match Settings" window allows the user to configure the properties
209 *  for protein match computation. These settings include turning the match
210 *  computation on and off (bound to awar #PFOLD_AWAR_ENABLE), selecting the reference
211 *  protein secondary structure SAI (bound to awar #PFOLD_AWAR_SELECTED_SAI), choosing
212 *  the match method (bound to awar #PFOLD_AWAR_MATCH_METHOD, see #PFOLD_MATCH_METHOD)
213 *  and the definition of the match pairs (bound to awar #PFOLD_AWAR_PAIR_TEMPLATE
214 *  and #pfold_match_type_awars, see #PFOLD_MATCH_TYPE and #pfold_pairs) as well as
215 *  the match symbols (bound to awar #PFOLD_AWAR_SYMBOL_TEMPLATE and
216 *  #pfold_match_type_awars or #PFOLD_AWAR_SYMBOL_TEMPLATE_2, see #PFOLD_MATCH_TYPE
217 *  and #pfold_pair_chars or #PFOLD_PAIR_CHARS_2). Via a filter (bound to
218 *  #PFOLD_AWAR_SAI_FILTER) the SAIs shown in the option menu can be narrowed down to
219 *  a selection of SAIs whose names contain the specified string. The callback function
220 *  #ED4_pfold_select_SAI_and_update_option_menu() is bound to the SAI option menu and
221 *  the SAI filter to update the selected SAI in the editor or the selection in the
222 *  SAI option menu.
223 */
224AW_window *ED4_pfold_create_props_window(AW_root *awr, AW_cb_struct * /*owner*/awcbs);
225
226#endif
Note: See TracBrowser for help on using the repository browser.