libStatGen Software  1
SmithWaterman.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #if !defined(_SMITH_WATERMAN_)
19 #define _SMITH_WATERMAN_
20 
21 #include <string.h> // for inline use of strcat, etc
22 #include <limits.h> // for INT_MAX
23 #include <stdint.h> // for uint32_t and friends
24 
25 //
26 // This file implements a bi-directional, banded Smith Waterman matching
27 // algorithm.
28 //
29 // The design is dictated by several observations:
30 //
31 // - building the full matrix H for the read and reference is slow,
32 // so we perform it only for a band down the diagonal, thus speeding
33 // up the algorithm at the cost of reduced detection of insert/deletes.
34 //
35 // - we must minimize data copying
36 //
37 // - we must have the ability to test the algorithm quickly and simply,
38 // hence we implement the class as a template so that this file doesn't
39 // have to depend on Karma's GenomeSequence object, which is a relatively
40 // heavy weight object to do testing against.
41 //
42 // - because Karma uses index words to determine match candidate locations
43 // across the genome, and because we use the banded Smith Waterman approach,
44 // we must provide bi-directional Smith Waterman matching. See example below.
45 //
46 // To fully understand the examples below, make sure you understand
47 // Phred quality scores, CIGAR strings, and pattern matching.
48 //
49 // Simple Functional Examples:
50 //
51 // Given a read, a read quality and a reference, we want to obtain some
52 // measure of how well that read matches the reference at that given location.
53 // So, we have:
54 // Read: "ATCG"
55 // Quality: "$$$$" (Phred scores - all are decimal 3)
56 // Reference: "ATCG"
57 // We expect: sumQ = 0, and Cigar "4M"
58 //
59 // Complex Functional Examples:
60 // Read: "AATCG"
61 // Quality: "$$$$$" (Phred scores - all are decimal 3)
62 // Reference: "ATCG"
63 // We expect: sumQ = 3, and Cigar "1M1I3M"
64 //
65 // Backwards matching:
66 // It is harder for me to construct a clear example, so imagine a read with
67 // cumulative inserts or deletes that sum up to a number larger than the
68 // width of the band along the diagonal. If we perform SW in the 'wrong'
69 // direction, we will lose the read entirely, whereas if we start from the
70 // end that is known to be matching, we may obtain a good match for the bulk
71 // of the read.
72 //
73 // For example, you could imagine a read where the first 10 bases had a mess
74 // of inserts, but then was clean for the next 100 bases. You'd want it
75 // to match as many of the good bases as practical, even if you knew you were
76 // losing information at the end.
77 //
78 
79 #include <assert.h>
80 #include <stdio.h>
81 #include <stdlib.h>
82 #include <string>
83 #include <iostream>
84 #include <iomanip>
85 #include <utility>
86 #include <vector>
87 
88 #include "CigarRoller.h"
89 #include "Generic.h"
90 
91 using std::cout;
92 using std::cin;
93 using std::cerr;
94 using std::setw;
95 using std::endl;
96 using std::pair;
97 using std::vector;
98 
99 #if !defined(MAX)
100 #define MAX(x,y) ((x)>(y) ? (x) : (y))
101 #endif
102 #if !defined(MIN)
103 #define MIN(x,y) ((x)<(y) ? (x) : (y))
104 #endif
105 
106 //
107 // Implement the core of Smith Waterman as described in:
108 // http://en.wikipedia.org/wiki/Smith_waterman
109 //
110 // The only variation from the basic SW algorithm is the
111 // use of a banded approach - to limit the algorithm to
112 // a band along the diagonal. This limits the maximum
113 // additive number of indels, but allows an O(c*M) approach,
114 // where c is the constant max number of indels allowed.
115 //
116 // This is implemented as function templates because for testing, it is easier
117 // to use character arrays. In our mapper, we will be using a
118 // combination of a String object for the read and the genome object
119 // as the reference. Both can be indexed, and give a character (base)
120 // value, but the code would be duplicated if we implement SW for
121 // each type of argument.
122 //
123 // Htype -> the type of the array H cell (8 or 16 bit unsigned int)
124 // Atype -> the read string type (must have Atype::operator [] defined)
125 //
126 template <int maxReadLengthH, int maxReferenceLengthH, typename HCellType, typename Atype, typename Btype, typename QualityType, typename readIndexType, typename referenceIndexType>
128 {
129 public:
130 
131  //
132  // XXX in theory, this weight should be sensitive
133  // to the quality of the base, and should have
134  // an appropriate cost for an indel, as well.
135  //
136  // I think we need to get rid of this, since it is
137  // basically wrong for our needs.
138  //
139  struct weight
140  {
141  weight()
142  {
143  match=2;
144  misMatch=-1;
145  insert=-1;
146  del=-1;
147  };
148 
149  int match;
150  int misMatch;
151  int insert;
152  int del;
153  };
154 
155  HCellType H[maxReadLengthH][maxReferenceLengthH];
156  Atype *A;
157  Btype *B;
158  QualityType *qualities;
159 
160  int m,n;
161  readIndexType MOffset; // constant offset to m (read)
162  referenceIndexType NOffset; // constant offset to n (reference)
163  weight w;
164  int allowedInsertDelete;
165  int direction;
166  int gapOpenCount;
167  int gapCloseCount;
168  int gapExtendCount;
169  vector<pair<int,int> > alignment;
170  void clearAlignment()
171  {
172  alignment.clear();
173  }
174 
175  HCellType maxCostValue; // max Cost value in H
176  pair<int,int> maxCostPosition; // row/col of max cost value in H
177 
178  //
179  // Clear the member variables only.
180  // To clear H, call clearH().
181  //
182  // In theory, clear() plus set() should be sufficient to
183  // get a clean run, but I haven't tested this extensively.
184  //
185  void clear()
186  {
187  maxCostPosition.first = 0;
188  maxCostPosition.second = 0;
189  A = NULL;
190  B = NULL;
191  qualities = NULL;
192  m = 0;
193  n = 0;
194  MOffset = 0;
195  NOffset = 0;
196  allowedInsertDelete = 0;
197  direction = 0;
198  gapOpenCount = 0;
199  gapCloseCount = 0;
200  gapExtendCount = 0;
201  }
202 
203  // caller will be using set* methods to set everything up.
204  SmithWaterman()
205  {
206  clear();
207  clearH();
208  }
209 
210  // construct with everything and the kitchen sink:
212  Atype *A,
213  QualityType *qualities,
214  Btype *B,
215  int m,
216  int n,
217  int allowedInsertDelete = INT_MAX,
218  int direction = 1,
219  readIndexType MOffset = 0,
220  referenceIndexType NOffset = 0):
221  A(A),
222  qualities(qualities),
223  B(B),
224  m(m),
225  n(n),
226  allowedInsertDelete(allowedInsertDelete),
227  direction(direction),
228  MOffset(MOffset),
229  NOffset(NOffset),
230  maxCostValue((HCellType) 0)
231  {
232  }
233 
234  void setRead(Atype *A)
235  {
236  this->A = A;
237  }
238  void setReadQuality(QualityType *qualities)
239  {
240  this->qualities = qualities;
241  }
242  void setReference(Btype *B)
243  {
244  this->B = B;
245  }
246 
247  // Caller may wish to index into the read to do the matching against
248  // only part of the read.
249  // NB: the quality length and offset are the same as the read.
250  void setReadLength(int m)
251  {
252  this->m = m;
253  }
254  void setReadOffset(readIndexType MOffset)
255  {
256  this->MOffset = MOffset;
257  }
258 
259  // The reference is typically long, and not necessarily a char *,
260  // so we provide an offset here. If it were always a char *,
261  // we'd just modify the caller to point directly at the reference
262  // location.
263  void setReferenceLength(int n)
264  {
265  this->n = n;
266  }
267  void setReferenceOffset(referenceIndexType NOffset)
268  {
269  this->NOffset = NOffset;
270  }
271 
272  //
273  // Configuration: how wide is the band on the diagonal?
274  // We should keep this small -- 1, 2, 3 or similar. If
275  // the value is default (INT_MAX), then the full matrix
276  // will be built, which is fine, but quite slow.
277  //
278  // If this paramater is made smaller than when a previous
279  // call to populateH was made, clearH will also need to be called.
280  //
281  void setAllowedInsertDelete(int allowedInsertDelete = INT_MAX)
282  {
283  this->allowedInsertDelete = allowedInsertDelete;
284  }
285 
286  //
287  // Configuration: which end do we begin performing SW matching
288  // from? We need this because of index 'anchors' in the karma
289  // matcher.
290  void setDirection(int direction)
291  {
292  this->direction = direction;
293  }
294 
295  void clearH()
296  {
297  memset(H, 0, sizeof(H));
298  }
299 
300  void populateH()
301  {
302 
303  maxCostValue = 0;
304 
305  for (int i=1; i<=m ; i++)
306  {
307 
308  // implement a banded Smith-Waterman approach:
309  int low = MAX(1, i - allowedInsertDelete);
310  int high = MIN(n, i + allowedInsertDelete);
311 
312  for (int j=low; j<=high ; j++)
313  {
314  HCellType c;
315  c = 0;
316  if (direction>0) c = MAX(c, H[i-1][j-1] + (((*A)[MOffset + i-1]==(*B)[NOffset + j-1]) ? w.match : w.misMatch));
317  else c = MAX(c, H[i-1][j-1] + (((*A)[MOffset + m-i+0]==(*B)[NOffset + n-j+0]) ? w.match : w.misMatch));
318  c = MAX(c, H[i-1][j] + w.del);
319  c = MAX(c, H[i][j-1] + w.insert);
320  H[i][j] = c;
321  if (c>maxCostValue)
322  {
323  maxCostValue = c;
324  maxCostPosition.first = i;
325  maxCostPosition.second = j;
326  }
327  }
328  }
329  }
330 
331 //
332 // Given the matrix H as filled in by above routine, print it out.
333 //
334  void printH(bool prettyPrint = true)
335  {
336  // print the scoring matrix:
337  for (int i=-1; i<=m ; i++)
338  {
339  for (int j=-1; j<=n ; j++)
340  {
341  if (prettyPrint) cout << setw(3);
342  if (i==-1 && j==-1)
343  {
344  if (prettyPrint) cout << " ";
345  else cout << "\t";
346  }
347  else if (j==-1)
348  {
349  if (!prettyPrint) cout << "\t";
350  if (i==0) cout << "-";
351  else cout << (*A)[MOffset + direction>0 ? i-1 : m - i];
352  }
353  else if (i==-1)
354  {
355  if (!prettyPrint) cout << "\t";
356  if (j==0) cout << "-";
357  else cout << (*B)[NOffset + direction>0 ? j-1 : n - j];
358  }
359  else
360  {
361  if (!prettyPrint) cout << "\t";
362  cout << H[i][j];
363  }
364  }
365  cout << endl;
366  }
367  }
368 
369  void debugPrint(bool doPrintH = true)
370  {
371  if (doPrintH) printH();
372  cout << "maxCostPosition = " << maxCostPosition << std::endl;
373  if (alignment.empty()) cout << "alignment vector is empty.\n";
374  else
375  {
376  cout << "alignment vector:\n";
377  for (vector<pair<int,int> >::iterator i=alignment.begin(); i < alignment.end(); i++)
378  {
379  cout << (i - alignment.begin()) << ": " << *i << "\n";
380  }
381  }
382  cout << std::endl;
383  }
384 
385 //
386 // Given the Matrix H as filled in by populateH, fill in the
387 // alignment vector with the indeces of the optimal match.
388 //
389  void populateAlignment()
390  {
391  alignment.clear();
392  int i = m, j = n;
393 
394  i = maxCostPosition.first;
395  j = maxCostPosition.second;
396 
397  //
398  // Stop when we either reach zero cost cell or
399  // when we reach the upper left corner of H.
400  // A zero cost cell to the lower right means we
401  // are soft clipping that end.
402  //
403  while (H[i][j] > 0 || (i>0 && j>0))
404  {
405 // #define DEBUG_ALIGNMENT_VECTOR
406 #if defined(DEBUG_ALIGNMENT_VECTOR)
407  cout << "alignment.push_back(" << i << ", " << j << ")" << endl;
408 #endif
409  alignment.push_back(pair<int,int>(i,j));
410  if (H[i-1][j-1]>=H[i-1][j] && H[i-1][j-1]>=H[i][j-1])
411  {
412  // diagonal upper left cell is biggest
413  i--;
414  j--;
415  }
416  else if (H[i-1][j] < H[i][j-1])
417  {
418  // upper cell is biggest
419  j--;
420  }
421  else
422  {
423  // left cell is biggest
424  i--;
425  }
426  }
427  alignment.push_back(pair<int,int>(i,j));
428 #if defined(DEBUG_ALIGNMENT_VECTOR)
429  cout << "alignment.push_back(" << i << ", " << j << ")" << endl;
430  cout << "alignment.size(): " << alignment.size() << endl;
431 #endif
432  }
433 
434  //
435  // Compute the sumQ for a read that has been mapped using populateH().
436  //
437  // In the simplest case, the read lies on the diagonal of the
438  // matrix H, which means it has only matches and mismatches:
439  // no inserts or deletes.
440  //
441  // However, in general, it is possible to have 0 or more insert,
442  // delete, mismatch and soft clipped bases in the read, so we
443  // need to accomodate all of those variations.
444  //
445  // XXX finish this.
446  //
447 
448  int getSumQ()
449  {
450  if (direction>0) return getSumQForward();
451  else return getSumQBackward();
452  }
453 
454  int getSumQForward()
455  {
456  int sumQ = 0;
457  vector<pair<int,int> >::reverse_iterator i;
458 
459  for (i=alignment.rbegin(); i < alignment.rend() - 1; i++)
460  {
461 // #define DEBUG_GETSUMQ
462 #if defined(DEBUG_GETSUMQ)
463  cout << *i << ": ";
464 #endif
465  if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second + 1))
466  {
467  // match/mismatch
468 #if defined(DEBUG_GETSUMQ)
469  cout << "Match/Mismatch";
470 #endif
471  if ((*A)[MOffset + (*i).first] != (*B)[NOffset + (*i).second])
472  sumQ += (*qualities)[MOffset + (*i).first] - '!';
473  }
474  else if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second))
475  {
476  // insert?
477 #if defined(DEBUG_GETSUMQ)
478  cout << "Insert";
479 #endif
480  sumQ += 50;
481  }
482  else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second + 1))
483  {
484  // delete?
485 #if defined(DEBUG_GETSUMQ)
486  cout << "Delete";
487 #endif
488  sumQ += 50;
489  }
490  }
491 #if defined(DEBUG_GETSUMQ)
492  cout << endl;
493 #endif
494  return sumQ;
495  }
496 
497  int getSumQBackward()
498  {
499  int sumQ = 0;
500  vector<pair<int,int> >::iterator i;
501 
502  for (i=alignment.begin(); i < alignment.end() - 1; i++)
503  {
504 #if defined(DEBUG_GETSUMQ)
505  cout << *i << ": ";
506 #endif
507  if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second - 1))
508  {
509  // match/mismatch
510 #if defined(DEBUG_GETSUMQ)
511  cout << "Match/Mismatch";
512 #endif
513  if ((*A)[MOffset + m - (*i).first] != (*B)[NOffset + n - (*i).second])
514  sumQ += (*qualities)[MOffset + m - (*i).first] - '!';
515  }
516  else if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second))
517  {
518  // insert?
519 #if defined(DEBUG_GETSUMQ)
520  cout << "Insert?";
521 #endif
522  sumQ += 50;
523  }
524  else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second - 1))
525  {
526  // delete?
527 #if defined(DEBUG_GETSUMQ)
528  cout << "Delete?";
529 #endif
530  sumQ += 50;
531  }
532  }
533 #if defined(DEBUG_GETSUMQ)
534  cout << endl;
535 #endif
536  return sumQ;
537  }
538 
539 #if 0
540  int getSumQ()
541  {
542  vector<pair<int,int> >::reverse_iterator i;
543  int sumQ = 0;
544  for (i=alignment.rbegin(); i < alignment.rend() - 1; i++)
545  {
546 #if defined(DEBUG_ALIGNMENT_VECTOR)
547  cout << "i: " << i - alignment.rbegin() << *i << endl;
548 #endif
549  // XXX NOT THIS SIMPLE - need to account for indels
550  if (direction>0)
551  {
552  if ((*A)[MOffset + (*i).first] != (*B)[NOffset + (*i).second])
553  sumQ += (*qualities)[MOffset + (*i).first] - '!';
554  }
555  else
556  {
557  // m and n are sizes, first and second are 1 based offsets
558  if ((*A)[MOffset + m - (*i).first] != (*B)[NOffset + n - (*i).second])
559  sumQ += (*qualities)[MOffset + m - (*i).first] - '!';
560  }
561  }
562  return sumQ;
563  }
564 #endif
565 
566  //
567  // Append cigar operations to an existing cigar list.
568  //
569  // XXX we no longer need the CigarRoller += methods.
570  //
571  // In this case, the Smith Waterman array H was created from
572  // the read and reference in the forward direction.
573  //
574  void rollCigarForward(CigarRoller &cigar)
575  {
576  vector<pair<int,int> >::reverse_iterator i;
577 
578  for (i=alignment.rbegin(); i < alignment.rend() - 1; i++)
579  {
580 // #define DEBUG_CIGAR
581 #if defined(DEBUG_CIGAR)
582  cout << *i << ": ";
583 #endif
584  if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second + 1))
585  {
586  // match/mismatch
587 #if defined(DEBUG_CIGAR)
588  cout << "Match/Mismatch";
589 #endif
590  cigar.Add(CigarRoller::match, 1);
591  }
592  else if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second))
593  {
594  // insert?
595 #if defined(DEBUG_CIGAR)
596  cout << "Insert";
597 #endif
598  cigar.Add(CigarRoller::insert, 1);
599  }
600  else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second + 1))
601  {
602  // delete?
603 #if defined(DEBUG_CIGAR)
604  cout << "Delete";
605 #endif
606  cigar.Add(CigarRoller::del, 1);
607  }
608  }
609  // if there is soft clipping, allow for it (::Add will
610  // ignore if the count is 0):
611  cigar.Add(CigarRoller::softClip, getSoftClipCount());
612 #if defined(DEBUG_CIGAR)
613  cout << endl;
614 #endif
615  }
616 
617  //
618  // Append cigar operations to an existing cigar list.
619  //
620  // XXX we no longer need the CigarRoller += methods.
621  //
622  // In this case, the Smith Waterman array H was created from
623  // the read and reference in the reverse direction.
624  //
625  void rollCigarBackward(CigarRoller &cigar)
626  {
627  vector<pair<int,int> >::iterator i;
628 
629  // if there is soft clipping, allow for it (::Add will
630  // ignore if the count is 0):
631  cigar.Add(CigarRoller::softClip, getSoftClipCount());
632 
633  i = alignment.begin();
634 
635  for (i=alignment.begin();
636  i < alignment.end() - 1;
637  i++)
638  {
639 #if defined(DEBUG_CIGAR)
640  cout << *i << ": ";
641 #endif
642  if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second - 1))
643  {
644  // match/mismatch
645 #if defined(DEBUG_CIGAR)
646  cout << "Match/Mismatch";
647 #endif
648  cigar.Add(CigarRoller::match, 1);
649  }
650  else if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second))
651  {
652  // insert?
653 #if defined(DEBUG_CIGAR)
654  cout << "Insert?";
655 #endif
656  cigar.Add(CigarRoller::insert, 1);
657  }
658  else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second - 1))
659  {
660  // delete?
661 #if defined(DEBUG_CIGAR)
662  cout << "Delete?";
663 #endif
664  cigar.Add(CigarRoller::del, 1);
665  }
666  }
667 #if defined(DEBUG_CIGAR)
668  cout << endl;
669 #endif
670  }
671 
672  //
673  // Given the direction, and the alignment vector, obtain
674  // the soft clip (the mismatches at the end of the string which
675  // can in Smith Waterman matching be considered as a separate case).
676  //
677  // NB: be careful that the backward case is correct - it passes
678  // all of two built in tests, but it may not be generally correct.
679  //
680  int getSoftClipCount()
681  {
682  if (direction>0)
683  {
684  // invariant: assert(maxCostPosition == alignment.front());
685  return m - maxCostPosition.first;
686  }
687  else
688  {
689 // return alignment.back().first; // nope, this always returns 0
690  // XXX BE CAREFUL... not sure this is right, either.
691 // return n - maxCostPosition.second;
692  return m - maxCostPosition.first;
693  }
694  }
695 
696  void rollCigar(CigarRoller &cigar)
697  {
698  if (direction>0) rollCigarForward(cigar);
699  else rollCigarBackward(cigar);
700  }
701 
702  //
703  // all in one local alignment:
704  //
705  // Steps:
706  // 1 - do internal setup
707  // 2 - populate H
708  // 3 - create alignment vector (this chooses the best path)
709  // 4 - compute sumQ
710  // 5 - compute the cigar string
711  // 6 - compute and update the softclip for the read
712  //
713  bool localAlignment(
714  uint32_t bandSize,
715  Atype &read,
716  readIndexType readLength,
717  QualityType &quality,
718  Btype &reference,
719  referenceIndexType referenceLength,
720  referenceIndexType referenceOffset,
721  CigarRoller &cigarRoller,
722  uint32_t &softClipCount,
723  referenceIndexType &cigarStartingPoint,
724  int &sumQ
725  )
726  {
727 
728  clear();
729 
730  cigarRoller.clear();
731 
732  setDirection(+1);
733  setAllowedInsertDelete(bandSize);
734 
735  setRead(&read);
736  setReadOffset(0);
737  setReadLength(readLength);
738 
739  setReadQuality(&quality);
740 
741  setReference(&reference);
742  setReferenceOffset(referenceOffset);
743  setReferenceLength(referenceLength);
744 
745  populateH();
746 
747  softClipCount = getSoftClipCount();
748 
749  populateAlignment();
750 
751  rollCigar(cigarRoller);
752 
753  sumQ = getSumQ();
754 
755  return false;
756 
757  };
758 
759 };
760 
761 #endif
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
void clear()
Clear this object so that it has no Cigar Operations.
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)...
Definition: Cigar.h:94
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object...
Definition: CigarRoller.h:66