libStatGen Software  1
SamFileHeader.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef __SAM_FILE_HEADER_H__
19 #define __SAM_FILE_HEADER_H__
20 
21 #include <map>
22 #include <stdint.h>
23 
24 #include "SamReferenceInfo.h"
25 #include "SamHeaderHD.h"
26 #include "SamHeaderSQ.h"
27 #include "SamHeaderRG.h"
28 #include "SamHeaderPG.h"
29 
30 /// This class allows a user to get/set the fields in a SAM/BAM Header.
31 /// Sam/Bam headers contain comments and multiple SamHeaderRecords
32 /// (HD, SQs, RGs, PGs) comprised of tag/value pairs with each tag only
33 /// appearing once within a specific record.
35 {
36 public:
37  SamFileHeader();
38  ~SamFileHeader();
39 
40  /////////////////////////////
41  /// @name Copying a Header
42  /// These methods are ways of copying the contents of one header into
43  /// another one.
44  //@{
45 
46  /// Copy Constructor copies the specified header into this one.
47  SamFileHeader(const SamFileHeader& header);
48 
49  /// Overload operator = to copy the passed in header into this header.
50  SamFileHeader & operator = (const SamFileHeader& header);
51 
52  /// Copy method copies the passed in header into this header.
53  /// Returns true if at least one header line was successfully copied.
54  bool copy(const SamFileHeader& header);
55  //@}
56 
57  /// Initialize the header.
58  void resetHeader();
59 
60  /////////////////////////////
61  /// @name Get the Entire Header
62  /// Get the entire header as a single string.
63  //@{
64 
65  /// Set the passed in string to the entire header string, clearing its
66  /// current contents.
67  /// \return true if successfully set (even if set to "")
68  bool getHeaderString(std::string& header) const;
69 
70  //@}
71 
72  /// Get the reference ID for the specified reference name (chromosome).
73  /// If addID is set to true, a reference id will be created for the
74  /// referenceName if one does not already exist. If addID is set to
75  /// false (default), it will return SamReferenceInfo::NO_REF_ID.
76  int getReferenceID(const String & referenceName, bool addID = false);
77 
78  /// Get the reference ID for the specified reference name (chromosome).
79  /// If addID is set to true, a reference id will be created for the
80  /// referenceName if one does not already exist. If addID is set to
81  /// false (default), it will return SamReferenceInfo::NO_REF_ID.
82  int getReferenceID(const char* referenceName, bool addID = false);
83 
84  /// Return the reference name (chromosome) for the specified reference id.
85  const String & getReferenceLabel(int id) const;
86 
87  /// Get the Reference Information
88  const SamReferenceInfo& getReferenceInfo() const;
89 
90  // Get the Reference Information for updating separately when reading
91  // BAMs...should only be called by BamInterface.
92  SamReferenceInfo& getReferenceInfoForBamInterface();
93 
94  ////////////////////////////////////////////////////////////////////////
95  // Set Values in the header
96  ////////////////////////////////////////////////////////////////////////
97 
98  /////////////////////////////////////////
99  /// @name Adding an entire header/comment line.
100  /// These methods are ways of adding an entire header line at once.
101  //@{
102 
103  /// Add a header line that is just one tag with a const char* value.
104  /// Note: This method will only do one tag per type on a line, so if a
105  /// type has multiple tags, the whole line needs to be added at once,
106  /// and a different method should be used.
107  bool addHeaderLine(const char* type, const char* tag, const char* value);
108 
109  /// Add a header line that is already preformatted in a const char*.
110  /// Returns true if at least one header line was successfully added.
111  bool addHeaderLine(const char* headerLine);
112 
113  /// Add a header that is already preformatted in a const char*.
114  /// Returns true if at least one header line was successfully added.
115  bool addHeader(const char* header);
116 
117  /// Add the specified comment to the header (do not include "@CO" or "\n").
118  /// \return true if successfully added, false if not.
119  bool addComment(const char* comment);
120 
121  //@}
122 
123 
124  /////////////////////////////////////////
125  /// @name Set/Add/Remove a Single Tag
126  /// The passed in tag should be the two character SAM tag as defined
127  /// in the SAM spec. A tag is removed from the header record by setting
128  /// it to "". For the SQ and RG header types, the key tags (SN for SQ
129  /// and ID for RG) may not be modified or removed once set. This is
130  /// because these values are used as a lookup key for the header record,
131  /// so the entire record must be removed.
132  //@{
133 
134 // // Set the specified header type tag to the specified value in the
135 // // header with the specified keyID. keyID must be specified when
136 // // type = SQ, RG, or PG.
137 // bool setTag(SamHeaderRecord::SamHeaderRecordType type, const char* tag,
138 // const char* value, const char* keyID = NULL);
139 
140  /// Set the specified tag to the specified value in the HD header, remove
141  /// the tag by specifying value="".
142  /// \return true if the tag was successfully set, false if not.
143  bool setHDTag(const char* tag, const char* value);
144 
145  /// Set the specified tag to the specified value in the SQ header with
146  /// the specified name, remove the tag by specifying value="". If the
147  /// header does not yet exist, the tag must be "LN" and the header is added
148  /// with the specified LN value and the SN value passed in name.
149  /// The SN & LN tags may not be modified or removed after they are
150  /// set unless the entire record is deleted.
151  /// \return true if the tag was successfully set, false if not.
152  bool setSQTag(const char* tag, const char* value, const char* name);
153 
154  /// Set the specified tag to the specified value in the RG header with
155  /// the specified id, remove the tag by specifying value="". If the
156  /// header does not yet exist, the header is added and so is the ID tag
157  /// with the value set to the passed in id. The ID tag may not be
158  /// modified or removed after it is set unless the entire record is deleted.
159  /// \return true if the tag was successfully set, false if not.
160  bool setRGTag(const char* tag, const char* value, const char* id);
161 
162  /// Set the specified tag to the specified value in the PG header with
163  /// the specified id, remove the tag by specifying value="". If the
164  /// header does not yet exist, the header is added and so is the ID tag
165  /// with the value set to the passed in id. The ID tag may not be
166  /// modified or removed after it is set unless the entire record is deleted.
167  /// \return true if the tag was successfully set, false if not.
168  bool setPGTag(const char* tag, const char* value, const char* id);
169 
170  //@}
171 
172  /////////////////////////////////////////
173  /// @name Add an Already Setup SamHeaderRecord
174  /// NOTE: These methods add a pointer to the passed in record.
175  /// The header record will be deleted when it's cleaned up from this header.
176  /// NOTE: Do NOT delete the passed in record, the SamFileHeader class
177  /// takes care of that itself.
178  //@{
179 
180  /// Add the HD record to the header.
181  /// Note: it adds a pointer to the passed in header record. The header
182  /// record will be deleted when it is cleaned up from this header.
183  /// \return true if the record was successfully added, false otherwise.
184  bool addHD(SamHeaderHD* hd);
185 
186  /// Add the SQ record to the header.
187  /// Note: it adds a pointer to the passed in header record. The header
188  /// record will be deleted when it is cleaned up from this header.
189  /// \return true if the record was successfully added, false otherwise.
190  bool addSQ(SamHeaderSQ* sq);
191 
192  /// Add the RG record to the header.
193  /// Note: it adds a pointer to the passed in header record. The header
194  /// record will be deleted when it is cleaned up from this header.
195  /// \return true if the record was successfully added, false otherwise.
196  bool addRG(SamHeaderRG* rg);
197 
198  /// Add the PG record to the header.
199  /// Note: it adds a pointer to the passed in header record. The header
200  /// record will be deleted when it is cleaned up from this header.
201  /// \return true if the record was successfully added, false otherwise.
202  bool addPG(SamHeaderPG* pg);
203 
204  /// Add a copy of the specified header record to the header.
205  /// Note: it creates a new header record that is identical to the specified
206  /// one and adds it to the header. The passed in pointer will not be
207  /// deleted due to this.
208  /// \return true if the record was successfully added, false otherwise.
209  bool addRecordCopy(const SamHeaderRecord& hdrRec);
210 
211  //@}
212 
213  ////////////////////////////////////////////////////////////////////////
214  /// @name Remove an Entire Header Record
215  //@{
216 
217  /// Remove the HD record.
218  /// \return true if successfully removed or did not exist, false if
219  /// the record still exists.
220  bool removeHD();
221 
222  /// Remove SQ record with the specified key.
223  /// NOTE: Does not remove it from the BAM index.
224  /// \return true if successfully removed or did not exist, false if
225  /// the record still exists.
226  bool removeSQ(const char* name);
227 
228  /// Remove RG record with the specified key.
229  /// \return true if successfully removed or did not exist, false if
230  /// the record still exists.
231  bool removeRG(const char* id);
232 
233  /// Remove PG record with the specified key.
234  /// \return true if successfully removed or did not exist, false if
235  /// the record still exists.
236  bool removePG(const char* id);
237 
238  //@}
239 
240  ////////////////////////////////////////////////////////////////////////
241  /// @name Get a Specific Tag
242  /// These methods return the value associated with the specified tag.
243  /// If the tag does not exist in the record "" is returned.
244  ///
245  /// For SQ, RG, and PG the value returned is for the tag associated with
246  /// the specified key (name/id). If a record with that key does not exist
247  /// or if the tag does not exist for the record with that key, "" is
248  /// returned.
249  //@{
250 
251  /// Returns the value associated with the specified HD tag, returning "" if
252  /// the tag does not exist in the header.
253  const char* getHDTagValue(const char* tag);
254 
255  /// Get the value associated with the specified tag on the SQ line with
256  /// the specified sequence name, returning "" if the tag or key does
257  /// not exist.
258  const char* getSQTagValue(const char* tag, const char* name);
259 
260  /// Get the value associated with the specified tag on the RG line with
261  /// the specified read group identifier, returning "" if the tag or key does
262  /// not exist.
263  const char* getRGTagValue(const char* tag, const char* id);
264 
265  /// Get the value associated with the specified tag on the PG line with
266  /// the specified id, returning "" if the tag or key does
267  /// not exist.
268  const char* getPGTagValue(const char* tag, const char* id);
269 
270  //@}
271 
272  /// Get the number of SQ objects.
273  int getNumSQs();
274 
275  /// Get the number of RG objects.
276  int getNumRGs();
277 
278  /// Get the number of PG objects.
279  int getNumPGs();
280 
281  ////////////////////////////////////////////////////////////////////////
282  /// @name Get a Specific Header Record
283  /// These methods return a pointer to the specific record that was
284  /// requested, returning NULL if that record does not exist in the header.
285  ///
286  /// The returned record can be modified to add/remove some tags.
287  /// Since a pointer is returned, the SamFileHeader automatically
288  /// reflects these changes.
289  //@{
290 
291  /// Get the HD object, returning NULL if there is no HD record.
292  SamHeaderHD* getHD();
293 
294  /// Get the SQ object with the specified sequence name, returning NULL
295  /// if there is no SQ object with that key.
296  SamHeaderSQ* getSQ(const char* name);
297 
298  /// Get the RG object with the specified read group identifier, returning
299  /// NULL if there is no RG object with that key.
300  SamHeaderRG* getRG(const char* id);
301 
302  /// Get the PG object with the specified id, returning NULL
303  /// if there is no PG object with that key.
304  SamHeaderPG* getPG(const char* id);
305 
306  //@}
307 
308 // //////////////////////////////////
309 // // Set methods for header fields.
310 // bool setVersion(const char* version);
311 // bool setSortOrder(const char* sortOrder);
312 // bool addSequenceName(const char* sequenceName);
313 // bool setSequenceLength(const char* keyID, int sequenceLength);
314 // bool setGenomeAssemblyId(const char* keyID, const char* genomeAssemblyId);
315 // bool setMD5Checksum(const char* keyID, const char* md5sum);
316 // bool setURI(const char* keyID, const char* uri);
317 // bool setSpecies(const char* keyID, const char* species);
318 // bool addReadGroupID(const char* readGroupID);
319 // bool setSample(const char* keyID, const char* sample);
320 // bool setLibrary(const char* keyID, const char* library);
321 // bool setDescription(const char* keyID, const char* description);
322 // bool setPlatformUnit(const char* keyID, const char* platform);
323 // bool setPredictedMedianInsertSize(const char* keyID, const char* isize);
324 // bool setSequencingCenter(const char* keyID, const char* center);
325 // bool setRunDate(const char* keyID, const char* runDate);
326 // bool setTechnology(const char* keyID, const char* technology);
327 // bool addProgram(const char* programID);
328 // bool setProgramVersion(const char* keyID, const char* version);
329 // bool setCommandLine(const char* keyID, const char* commandLine);
330 
331 // ///////////////////////////////////
332 // // Get methods for header fields.
333 // // Returns the number of SQ entries in the header.
334 // int32_t getSequenceDictionaryCount();
335 
336  /// Return the Sort Order value that is set in the Header, returning ""
337  /// if this field does not exist.
338  const char* getSortOrder();
339 
340 
341  /// DEPRECATED
342  const char* getTagSO();
343 
344  /////////////////////////////
345  /// @name Get the Header Record/Comment/Line by Record/Comment/Line
346  /// These methods iterate through the header.
347  /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
348  /// same iterator. getNextHeaderRecord that takes a header type
349  /// uses the same iterator as the getNextXXRecord with that type.
350  /// Otherwise the iterators are independent.
351  //@{
352 
353  /// Get the next SQ header record. After all SQ headers have been
354  /// retrieved, NULL is returned until a reset is called.
355  /// Independent from getNextHeaderRecord, getNextHeaderLine and the
356  /// other getNextXXRecord methods and the associated reset methods.
358 
359  /// Get the next RG header record. After all RG headers have been
360  /// retrieved, NULL is returned until a reset is called.
361  /// Independent from getNextHeaderRecord, getNextHeaderLine and the
362  /// other getNextXXRecord methods and the associated reset methods.
364 
365  /// Get the next PG header record. After all PG headers have been
366  /// retrieved, NULL is returned until a reset is called.
367  /// Independent from getNextHeaderRecord, getNextHeaderLine and the
368  /// other getNextXXRecord methods and the associated reset methods.
370 
371  /// Reset to the beginning of the header records so the next call
372  /// to getNextSQRecord returns the first SQ header record.
373  void resetSQRecordIter();
374 
375  /// Reset to the beginning of the header records so the next call
376  /// to getNextRGRecord returns the first RG header record.
377  void resetRGRecordIter();
378 
379  /// Reset to the beginning of the header records so the next call
380  /// to getNextPGRecord returns the first PG header record.
381  void resetPGRecordIter();
382 
383  /// Get the next header record of the specified type starting from the
384  /// specified index and update the index.
385  /// After all headers of that type have been retrieved,
386  /// NULL is returned until a reset is called for that type.
387  SamHeaderRecord* getNextHeaderRecord(uint32_t& index,
389 
390  /// Get the next header record, but not comment line. After all headers
391  /// have been retrieved, NULL is returned until a reset is called.
392  /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
393  /// same iterator.
395 
396  /// Set the passed in string to the next header line, overwriting
397  /// the passed in string. If there are no more header lines or there
398  /// is an error, false is returned and the passed in string is set to ""
399  /// until a reset is called.
400  /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
401  /// same iterator.
402  bool getNextHeaderLine(std::string &headerLine);
403 
404  /// Reset to the beginning of the header records so the next call
405  /// to getNextHeaderRecord returns the first header line.
406  void resetHeaderRecordIter();
407 
408  /// Append all of the comment lines to the specified string.
409  void appendCommentLines(std::string &commentLines);
410 
411  /// Returns the comment on the next comment line. Returns "" if all comment
412  /// lines have been returned, until resetCommentIter is called.
413  const char* getNextComment();
414 
415  /// Resets to the beginning of the comments so getNextComment returns
416  /// the first comment.
417  void resetCommentIter();
418 
419  //@}
420 
421 
422  /// Get the failure message if a method returned failure.
423  const char* getErrorMessage() { return(myErrorMessage.c_str()); }
424 
425  static const std::string EMPTY_RETURN;
426 
427 private:
428  // Parse the header string.
429  bool parseHeader(String& header);
430 
431  // Parse the specified line of the header.
432  bool parseHeaderLine(const String& headerLine);
433 
434  // Set the passed in string to the header line at the specified index.
435  // It does NOT clear the current contents of header.
436  bool getHeaderLine(unsigned int index, std::string& header) const;
437 
// Pack two characters into a single 16-bit key: ch1 occupies the high byte
// and ch2 the low byte (presumably the two letters of a SAM tag - confirm
// against the callers).
//
// Each character is widened through unsigned char before it is combined, so
// a character with its top bit set cannot sign-extend into the other byte,
// and no negative value is ever left-shifted.  For 7-bit ASCII input the
// result is identical to ((ch1 << 8) + ch2).
//
// \param ch1 character stored in the high byte of the key.
// \param ch2 character stored in the low byte of the key.
// \return the combined key.  Because the return type is signed, a key with
//         bit 15 set is expected to come back negative on two's-complement
//         targets.
int16_t makeKey(char ch1, char ch2)
{
    const unsigned int highByte = static_cast<unsigned char>(ch1);
    const unsigned int lowByte = static_cast<unsigned char>(ch2);
    const uint16_t packed = static_cast<uint16_t>((highByte << 8) | lowByte);
    return static_cast<int16_t>(packed);
}
442 
443  // Only one HD type is allowed per file.
444  SamHeaderHD* myHD;
445 
446  // There can be multiple SQ Types, indexed by SN.
447  StringHash mySQs;
448 
449  // There can be multiple RG Types, indexed by ID.
450  StringHash myRGs;
451 
452  // There can be multiple PG types, indexed by ID.
453  StringHash myPGs;
454 
455  // Reference Name information
456  SamReferenceInfo myReferenceInfo;
457 
458  // Vector of comments
459  std::vector<std::string> myComments;
460 
461  std::vector<SamHeaderRecord*> myHeaderRecords;
462 
463  std::string myErrorMessage;
464 
465  uint32_t myCurrentSQIndex;
466 
467  uint32_t myCurrentRGIndex;
468 
469  uint32_t myCurrentPGIndex;
470 
471  uint32_t myCurrentHeaderIndex;
472 
473  uint32_t myCurrentCommentIndex;
474 };
475 
476 #endif
477 
SamHeaderPG * getPG(const char *id)
Get the PG object with the specified id, returning NULL if there is no PG object with that key...
SamHeaderRecordType
Specifies the Type for the sam header record (line).
SamHeaderRG * getRG(const char *id)
Get the RG object with the specified read group identifier, returning NULL if there is no RG object w...
const char * getTagSO()
DEPRECATED.
const char * getRGTagValue(const char *tag, const char *id)
Get the value associated with the specified tag on the RG line with the specified read group identifi...
SamFileHeader & operator=(const SamFileHeader &header)
Overload operator = to copy the passed in header into this header.
const char * getNextComment()
Returns the comment on the next comment line.
SamHeaderSQ * getSQ(const char *name)
Get the SQ object with the specified sequence name, returning NULL if there is no SQ object with that...
bool addRecordCopy(const SamHeaderRecord &hdrRec)
Add a copy of the specified header record to the header.
const char * getHDTagValue(const char *tag)
Returns the value associated with the specified HD tag, returning "" if the tag does not exist in the...
const char * getPGTagValue(const char *tag, const char *id)
Get the value associated with the specified tag on the PG line with the specified id...
const String & getReferenceLabel(int id) const
Return the reference name (chromosome) for the specified reference id.
bool copy(const SamFileHeader &header)
Copy method copies the passed in header into this header.
int getNumSQs()
Get the number of SQ objects.
const SamReferenceInfo & getReferenceInfo() const
Get the Reference Information.
bool addSQ(SamHeaderSQ *sq)
Add the SQ record to the header.
void resetHeaderRecordIter()
Reset to the beginning of the header records so the next call to getNextHeaderRecord returns the firs...
bool setPGTag(const char *tag, const char *value, const char *id)
Set the specified tag to the specified value in the PG header with the specified id, remove the tag by specifying value="".
bool setHDTag(const char *tag, const char *value)
Set the specified tag to the specified value in the HD header, remove the tag by specifying value=""...
bool removeHD()
Remove the HD record.
void appendCommentLines(std::string &commentLines)
Append all of the comment lines to the specified string.
Class for tracking the reference information mapping between the reference ids and the reference name...
bool setSQTag(const char *tag, const char *value, const char *name)
Set the specified tag to the specified value in the SQ header with the specified name, remove the tag by specifying value="".
SamHeaderHD * getHD()
Get the HD object, returning NULL if there is no HD record.
bool getHeaderString(std::string &header) const
Set the passed in string to the entire header string, clearing its current contents.
bool removeRG(const char *id)
Remove RG record with the specified key.
SamHeaderRecord * getNextPGRecord()
Get the next PG header record.
bool addHeader(const char *header)
Add a header that is already preformatted in a const char*.
void resetHeader()
Initialize the header.
bool addHD(SamHeaderHD *hd)
Add the HD record to the header.
const char * getSortOrder()
Return the Sort Order value that is set in the Header, returning "" if this field does not exist...
bool removeSQ(const char *name)
Remove SQ record with the specified key.
void resetPGRecordIter()
Reset to the beginning of the header records so the next call to getNextPGRecord returns the first PG...
SamHeaderRecord * getNextHeaderRecord()
Get the next header record, but not comment line.
bool setRGTag(const char *tag, const char *value, const char *id)
Set the specified tag to the specified value in the RG header with the specified id, remove the tag by specifying value="".
bool addRG(SamHeaderRG *rg)
Add the RG record to the header.
bool addComment(const char *comment)
Add the specified comment to the header (do not include "@CO" or "\n").
const char * getErrorMessage()
Get the failure message if a method returned failure.
int getNumRGs()
Get the number of RG objects.
bool getNextHeaderLine(std::string &headerLine)
Set the passed in string to the next header line, overwriting the passed in string.
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:34
void resetCommentIter()
Resets to the beginning of the comments so getNextComment returns the first comment.
SamHeaderRecord * getNextRGRecord()
Get the next RG header record.
const char * getSQTagValue(const char *tag, const char *name)
Get the value associated with the specified tag on the SQ line with the specified sequence name...
bool addHeaderLine(const char *type, const char *tag, const char *value)
Add a header line that is just one tag with a const char* value.
bool removePG(const char *id)
Remove PG record with the specified key.
void resetSQRecordIter()
Reset to the beginning of the header records so the next call to getNextSQRecord returns the first SQ...
bool addPG(SamHeaderPG *pg)
Add the PG record to the header.
int getReferenceID(const String &referenceName, bool addID=false)
Get the reference ID for the specified reference name (chromosome).
This class encapsulates the tag value pairs contained with a SAM Header line with accessors for getti...
int getNumPGs()
Get the number of PG objects.
void resetRGRecordIter()
Reset to the beginning of the header records so the next call to getNextRGRecord returns the first RG...
SamHeaderRecord * getNextSQRecord()
Get the next SQ header record.