1 | #include "muscle.h" |
---|
2 | #include "tree.h" |
---|
3 | #include "textfile.h" // for test only |
---|
4 | #include "msa.h" |
---|
5 | #include "seqvect.h" |
---|
6 | #include "profile.h" |
---|
7 | #ifndef _MSC_VER |
---|
8 | #include <unistd.h> // for unlink |
---|
9 | #endif |
---|
10 | |
---|
11 | #define TRACE 0 |
---|
12 | |
---|
13 | /*** |
---|
14 | Find subfamilies from tree by following criteria: |
---|
15 | |
---|
16 | (a) number of leaves <= max, |
---|
17 | (b) is monophyletic, i.e. most recent common ancestor is parent |
---|
18 | of no more than one subfamily. |
---|
19 | ***/ |
---|
20 | |
---|
21 | static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount, |
---|
22 | unsigned SubFams[], unsigned &uSubFamCount) |
---|
23 | { |
---|
24 | if (tree.IsLeaf(uNodeIndex)) |
---|
25 | return 1; |
---|
26 | |
---|
27 | unsigned uLeft = tree.GetLeft(uNodeIndex); |
---|
28 | unsigned uRight = tree.GetRight(uNodeIndex); |
---|
29 | unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount); |
---|
30 | unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount); |
---|
31 | |
---|
32 | unsigned uLeafCount = uLeftCount + uRightCount; |
---|
33 | if (uLeftCount + uRightCount > uMaxLeafCount) |
---|
34 | { |
---|
35 | if (uLeftCount <= uMaxLeafCount) |
---|
36 | SubFams[uSubFamCount++] = uLeft; |
---|
37 | if (uRightCount <= uMaxLeafCount) |
---|
38 | SubFams[uSubFamCount++] = uRight; |
---|
39 | } |
---|
40 | else if (tree.IsRoot(uNodeIndex)) |
---|
41 | { |
---|
42 | if (uSubFamCount != 0) |
---|
43 | Quit("Error in SubFamRecurse"); |
---|
44 | SubFams[uSubFamCount++] = uNodeIndex; |
---|
45 | } |
---|
46 | |
---|
47 | return uLeafCount; |
---|
48 | } |
---|
49 | |
---|
50 | void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount) |
---|
51 | { |
---|
52 | *ptruSubFamCount = 0; |
---|
53 | SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount); |
---|
54 | |
---|
55 | #if TRACE |
---|
56 | { |
---|
57 | Log("\n"); |
---|
58 | Log("Tree:\n"); |
---|
59 | tree.LogMe(); |
---|
60 | //void DrawTree(const Tree &tree); |
---|
61 | //DrawTree(tree); |
---|
62 | Log("\n"); |
---|
63 | Log("%d subfams:\n", *ptruSubFamCount); |
---|
64 | for (unsigned i = 0; i < *ptruSubFamCount; ++i) |
---|
65 | Log(" %d=%d", i, SubFams[i]); |
---|
66 | Log("\n"); |
---|
67 | } |
---|
68 | #endif |
---|
69 | } |
---|
70 | |
---|
71 | //unsigned SubFams[9999]; |
---|
72 | //unsigned uSubFamCount; |
---|
73 | // |
---|
74 | //static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex) |
---|
75 | // { |
---|
76 | // const unsigned uRoot = tree.GetRootNodeIndex(); |
---|
77 | // unsigned uDist = 0; |
---|
78 | // while (uNodeIndex != uRoot) |
---|
79 | // { |
---|
80 | // ++uDist; |
---|
81 | // uNodeIndex = tree.GetParent(uNodeIndex); |
---|
82 | // } |
---|
83 | // return uDist; |
---|
84 | // } |
---|
85 | // |
---|
86 | //static void DrawNode(const Tree &tree, unsigned uNodeIndex) |
---|
87 | // { |
---|
88 | // if (!tree.IsLeaf(uNodeIndex)) |
---|
89 | // DrawNode(tree, tree.GetLeft(uNodeIndex)); |
---|
90 | // |
---|
91 | // unsigned uDist = DistFromRoot(tree, uNodeIndex); |
---|
92 | // for (unsigned i = 0; i < 5*uDist; ++i) |
---|
93 | // Log(" "); |
---|
94 | // Log("%d", uNodeIndex); |
---|
95 | // for (unsigned i = 0; i < uSubFamCount; ++i) |
---|
96 | // if (uNodeIndex == SubFams[i]) |
---|
97 | // { |
---|
98 | // Log("*"); |
---|
99 | // break; |
---|
100 | // } |
---|
101 | // Log("\n"); |
---|
102 | // |
---|
103 | // if (!tree.IsLeaf(uNodeIndex)) |
---|
104 | // DrawNode(tree, tree.GetRight(uNodeIndex)); |
---|
105 | // } |
---|
106 | // |
---|
107 | //static void DrawTree(const Tree &tree) |
---|
108 | // { |
---|
109 | // unsigned uRoot = tree.GetRootNodeIndex(); |
---|
110 | // DrawNode(tree, uRoot); |
---|
111 | // } |
---|
112 | // |
---|
113 | //void TestSubFams(const char *FileName) |
---|
114 | // { |
---|
115 | // Tree tree; |
---|
116 | // TextFile f(FileName); |
---|
117 | // tree.FromFile(f); |
---|
118 | // SubFam(tree, 5, SubFams, &uSubFamCount); |
---|
119 | // DrawTree(tree); |
---|
120 | // } |
---|
121 | |
---|
122 | static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[]) |
---|
123 | { |
---|
124 | if (tree.IsLeaf(uNodeIndex)) |
---|
125 | return; |
---|
126 | unsigned uLeft = tree.GetLeft(uNodeIndex); |
---|
127 | unsigned uRight = tree.GetRight(uNodeIndex); |
---|
128 | NodeInSubFam[uLeft] = true; |
---|
129 | NodeInSubFam[uRight] = true; |
---|
130 | |
---|
131 | SetInFam(tree, uLeft, NodeInSubFam); |
---|
132 | SetInFam(tree, uRight, NodeInSubFam); |
---|
133 | } |
---|
134 | |
---|
135 | void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex, |
---|
136 | MSA &msaOut) |
---|
137 | { |
---|
138 | const unsigned uSeqCount = vAll.GetSeqCount(); |
---|
139 | |
---|
140 | const char *InTmp = "asf_in.tmp"; |
---|
141 | const char *OutTmp = "asf_out.tmp"; |
---|
142 | |
---|
143 | unsigned *Leaves = new unsigned[uSeqCount]; |
---|
144 | unsigned uLeafCount; |
---|
145 | GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount); |
---|
146 | |
---|
147 | SeqVect v; |
---|
148 | for (unsigned i = 0; i < uLeafCount; ++i) |
---|
149 | { |
---|
150 | unsigned uLeafNodeIndex = Leaves[i]; |
---|
151 | unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex); |
---|
152 | Seq &s = vAll.GetSeqById(uId); |
---|
153 | v.AppendSeq(s); |
---|
154 | } |
---|
155 | |
---|
156 | #if TRACE |
---|
157 | { |
---|
158 | Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount); |
---|
159 | for (unsigned i = 0; i < uLeafCount; ++i) |
---|
160 | Log(" %s", v.GetSeqName(i)); |
---|
161 | Log("\n"); |
---|
162 | } |
---|
163 | #endif |
---|
164 | |
---|
165 | TextFile fIn(InTmp, true); |
---|
166 | |
---|
167 | v.ToFASTAFile(fIn); |
---|
168 | fIn.Close(); |
---|
169 | |
---|
170 | char CmdLine[4096]; |
---|
171 | sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp); |
---|
172 | // sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp); |
---|
173 | int NotUsed = system(CmdLine); |
---|
174 | |
---|
175 | TextFile fOut(OutTmp); |
---|
176 | msaOut.FromFile(fOut); |
---|
177 | |
---|
178 | for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex) |
---|
179 | { |
---|
180 | const char *Name = msaOut.GetSeqName(uSeqIndex); |
---|
181 | unsigned uId = vAll.GetSeqIdFromName(Name); |
---|
182 | msaOut.SetSeqId(uSeqIndex, uId); |
---|
183 | } |
---|
184 | |
---|
185 | unlink(InTmp); |
---|
186 | unlink(OutTmp); |
---|
187 | |
---|
188 | delete[] Leaves; |
---|
189 | } |
---|
190 | |
---|
191 | void ProgAlignSubFams() |
---|
192 | { |
---|
193 | MSA msaOut; |
---|
194 | |
---|
195 | SetOutputFileName(g_pstrOutFileName); |
---|
196 | SetInputFileName(g_pstrInFileName); |
---|
197 | |
---|
198 | SetMaxIters(g_uMaxIters); |
---|
199 | SetSeqWeightMethod(g_SeqWeight1); |
---|
200 | |
---|
201 | TextFile fileIn(g_pstrInFileName); |
---|
202 | SeqVect v; |
---|
203 | v.FromFASTAFile(fileIn); |
---|
204 | const unsigned uSeqCount = v.Length(); |
---|
205 | |
---|
206 | if (0 == uSeqCount) |
---|
207 | Quit("No sequences in input file"); |
---|
208 | |
---|
209 | ALPHA Alpha = ALPHA_Undefined; |
---|
210 | switch (g_SeqType) |
---|
211 | { |
---|
212 | case SEQTYPE_Auto: |
---|
213 | Alpha = v.GuessAlpha(); |
---|
214 | break; |
---|
215 | |
---|
216 | case SEQTYPE_Protein: |
---|
217 | Alpha = ALPHA_Amino; |
---|
218 | break; |
---|
219 | |
---|
220 | case SEQTYPE_DNA: |
---|
221 | Alpha = ALPHA_DNA; |
---|
222 | break; |
---|
223 | |
---|
224 | case SEQTYPE_RNA: |
---|
225 | Alpha = ALPHA_RNA; |
---|
226 | break; |
---|
227 | |
---|
228 | default: |
---|
229 | Quit("Invalid seq type"); |
---|
230 | } |
---|
231 | SetAlpha(Alpha); |
---|
232 | v.FixAlpha(); |
---|
233 | |
---|
234 | PTR_SCOREMATRIX UserMatrix = 0; |
---|
235 | if (0 != g_pstrMatrixFileName) |
---|
236 | { |
---|
237 | const char *FileName = g_pstrMatrixFileName; |
---|
238 | const char *Path = getenv("MUSCLE_MXPATH"); |
---|
239 | if (Path != 0) |
---|
240 | { |
---|
241 | size_t n = strlen(Path) + 1 + strlen(FileName) + 1; |
---|
242 | char *NewFileName = new char[n]; |
---|
243 | sprintf(NewFileName, "%s/%s", Path, FileName); |
---|
244 | FileName = NewFileName; |
---|
245 | } |
---|
246 | TextFile File(FileName); |
---|
247 | UserMatrix = ReadMx(File); |
---|
248 | g_Alpha = ALPHA_Amino; |
---|
249 | g_PPScore = PPSCORE_SP; |
---|
250 | } |
---|
251 | |
---|
252 | SetPPScore(); |
---|
253 | |
---|
254 | if (0 != UserMatrix) |
---|
255 | g_ptrScoreMatrix = UserMatrix; |
---|
256 | |
---|
257 | if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) |
---|
258 | { |
---|
259 | SetPPScore(PPSCORE_SPN); |
---|
260 | g_Distance1 = DISTANCE_Kmer4_6; |
---|
261 | } |
---|
262 | |
---|
263 | unsigned uMaxL = 0; |
---|
264 | unsigned uTotL = 0; |
---|
265 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
266 | { |
---|
267 | unsigned L = v.GetSeq(uSeqIndex).Length(); |
---|
268 | uTotL += L; |
---|
269 | if (L > uMaxL) |
---|
270 | uMaxL = L; |
---|
271 | } |
---|
272 | |
---|
273 | SetIter(1); |
---|
274 | g_bDiags = g_bDiags1; |
---|
275 | SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); |
---|
276 | |
---|
277 | SetMuscleSeqVect(v); |
---|
278 | |
---|
279 | MSA::SetIdCount(uSeqCount); |
---|
280 | |
---|
281 | // Initialize sequence ids. |
---|
282 | // From this point on, ids must somehow propogate from here. |
---|
283 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
284 | v.SetSeqId(uSeqIndex, uSeqIndex); |
---|
285 | |
---|
286 | if (uSeqCount > 1) |
---|
287 | MHackStart(v); |
---|
288 | |
---|
289 | if (0 == uSeqCount) |
---|
290 | { |
---|
291 | msaOut.Clear(); |
---|
292 | return; |
---|
293 | } |
---|
294 | |
---|
295 | if (1 == uSeqCount && ALPHA_Amino == Alpha) |
---|
296 | { |
---|
297 | const Seq &s = v.GetSeq(0); |
---|
298 | msaOut.FromSeq(s); |
---|
299 | return; |
---|
300 | } |
---|
301 | |
---|
302 | Tree GuideTree; |
---|
303 | TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1); |
---|
304 | SetMuscleTree(GuideTree); |
---|
305 | |
---|
306 | MSA msa; |
---|
307 | if (g_bLow) |
---|
308 | { |
---|
309 | ProgNode *ProgNodes = 0; |
---|
310 | ProgNodes = ProgressiveAlignE(v, GuideTree, msa); |
---|
311 | delete[] ProgNodes; |
---|
312 | } |
---|
313 | else |
---|
314 | ProgressiveAlign(v, GuideTree, msa); |
---|
315 | SetCurrentAlignment(msa); |
---|
316 | TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2); |
---|
317 | SetMuscleTree(GuideTree); |
---|
318 | |
---|
319 | unsigned *SubFams = new unsigned[uSeqCount]; |
---|
320 | unsigned uSubFamCount; |
---|
321 | SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount); |
---|
322 | |
---|
323 | SetProgressDesc("Align node"); |
---|
324 | const unsigned uNodeCount = 2*uSeqCount - 1; |
---|
325 | |
---|
326 | ProgNode *ProgNodes = new ProgNode[uNodeCount]; |
---|
327 | bool *NodeIsSubFam = new bool[uNodeCount]; |
---|
328 | bool *NodeInSubFam = new bool[uNodeCount]; |
---|
329 | |
---|
330 | for (unsigned i = 0; i < uNodeCount; ++i) |
---|
331 | { |
---|
332 | NodeIsSubFam[i] = false; |
---|
333 | NodeInSubFam[i] = false; |
---|
334 | } |
---|
335 | |
---|
336 | for (unsigned i = 0; i < uSubFamCount; ++i) |
---|
337 | { |
---|
338 | unsigned uNodeIndex = SubFams[i]; |
---|
339 | assert(uNodeIndex < uNodeCount); |
---|
340 | NodeIsSubFam[uNodeIndex] = true; |
---|
341 | SetInFam(GuideTree, uNodeIndex, NodeInSubFam); |
---|
342 | } |
---|
343 | |
---|
344 | unsigned uJoin = 0; |
---|
345 | unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); |
---|
346 | do |
---|
347 | { |
---|
348 | if (NodeIsSubFam[uTreeNodeIndex]) |
---|
349 | { |
---|
350 | #if TRACE |
---|
351 | Log("Node %d: align subfam\n", uTreeNodeIndex); |
---|
352 | #endif |
---|
353 | ProgNode &Node = ProgNodes[uTreeNodeIndex]; |
---|
354 | AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA); |
---|
355 | Node.m_uLength = Node.m_MSA.GetColCount(); |
---|
356 | } |
---|
357 | else if (!NodeInSubFam[uTreeNodeIndex]) |
---|
358 | { |
---|
359 | #if TRACE |
---|
360 | Log("Node %d: align two subfams\n", uTreeNodeIndex); |
---|
361 | #endif |
---|
362 | Progress(uJoin, uSubFamCount - 1); |
---|
363 | ++uJoin; |
---|
364 | |
---|
365 | const unsigned uMergeNodeIndex = uTreeNodeIndex; |
---|
366 | ProgNode &Parent = ProgNodes[uMergeNodeIndex]; |
---|
367 | |
---|
368 | const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); |
---|
369 | const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); |
---|
370 | |
---|
371 | ProgNode &Node1 = ProgNodes[uLeft]; |
---|
372 | ProgNode &Node2 = ProgNodes[uRight]; |
---|
373 | |
---|
374 | PWPath Path; |
---|
375 | AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); |
---|
376 | Parent.m_uLength = Parent.m_MSA.GetColCount(); |
---|
377 | |
---|
378 | Node1.m_MSA.Clear(); |
---|
379 | Node2.m_MSA.Clear(); |
---|
380 | } |
---|
381 | else |
---|
382 | { |
---|
383 | #if TRACE |
---|
384 | Log("Node %d: in subfam\n", uTreeNodeIndex); |
---|
385 | #endif |
---|
386 | ; |
---|
387 | } |
---|
388 | uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); |
---|
389 | } |
---|
390 | while (NULL_NEIGHBOR != uTreeNodeIndex); |
---|
391 | ProgressStepsDone(); |
---|
392 | |
---|
393 | unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); |
---|
394 | ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; |
---|
395 | |
---|
396 | TextFile fOut(g_pstrOutFileName, true); |
---|
397 | MHackEnd(RootProgNode.m_MSA); |
---|
398 | RootProgNode.m_MSA.ToFile(fOut); |
---|
399 | |
---|
400 | delete[] NodeInSubFam; |
---|
401 | delete[] NodeIsSubFam; |
---|
402 | delete[] ProgNodes; |
---|
403 | delete[] SubFams; |
---|
404 | |
---|
405 | ProgNodes = 0; |
---|
406 | NodeInSubFam = 0; |
---|
407 | NodeIsSubFam = 0; |
---|
408 | SubFams = 0; |
---|
409 | } |
---|