RDKit
Open-source cheminformatics and machine learning.
MolSupplier.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2019 greg landrum, Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_MOLSUPPLIER_H
12 #define RD_MOLSUPPLIER_H
13 
14 #include <RDGeneral/types.h>
15 
16 #include <string>
17 #include <list>
18 #include <memory>
19 #include <vector>
20 #include <iostream>
21 #include <GraphMol/ROMol.h>
22 
23 #ifdef RDK_BUILD_COORDGEN_SUPPORT
24 namespace schrodinger {
25 namespace mae {
26 class Reader;  // forward declaration only; held through std::shared_ptr in MaeMolSupplier
27 class Block;  // forward declaration only; held through std::shared_ptr in MaeMolSupplier
28 } // namespace mae
29 } // namespace schrodinger
30 #endif // RDK_BUILD_COORDGEN_SUPPORT
31 
32 namespace RDKit {
33 RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
34 
35 /*!
36 //
37 // Here are a couple of ways one can interact with MolSuppliers:
38 //
39 // 1) Lazy (ForwardIterator):
40 // while(!supplier.atEnd()){
41 // ROMol *mol = supplier.next();
42 // if(mol){
43 // do something;
44 // }
45 // }
46 // 2) Random Access:
47 // for(unsigned int i=0;i<supplier.length();i++){
48 // ROMol *mol = supplier[i];
49 // if(mol){
50 // do something;
51 // }
52 // }
53 //
54 //
55 */
57  // this is an abstract base class to supply molecules one at a time
58  public:
60  virtual ~MolSupplier(){};
61  virtual void init() = 0;
62  virtual void reset() = 0;
63  virtual bool atEnd() = 0;
64  virtual ROMol *next() = 0;
65 
66  private:
67  // disable automatic copy constructors and assignment operators
68  // for this class and its subclasses. They will likely be
69  // carrying around stream pointers and copying those is a recipe
70  // for disaster.
71  MolSupplier(const MolSupplier &);
72  MolSupplier &operator=(const MolSupplier &);
73 
74  protected:
75  // stream to read the molecules from:
76  std::istream *dp_inStream = nullptr;
77  // do we own dp_inStream?
78  bool df_owner = false;
79 };
80 
81 //! \brief a supplier from an SD file that only reads forward:
83  /*************************************************************************
84  * A lazy mol supplier from an SD file.
85  * - When new molecules are read using "next" their positions in the file are
86  *   noted.
87  ***********************************************************************************/
88  public:
89  ForwardSDMolSupplier() { init(); };
90 
91  explicit ForwardSDMolSupplier(std::istream *inStream,
92  bool takeOwnership = true, bool sanitize = true,
93  bool removeHs = true,
94  bool strictParsing = false);
95 
97  if (df_owner && dp_inStream) {
98  delete dp_inStream;
99  df_owner = false;
100  dp_inStream = NULL;
101  }
102  };
103 
104  virtual void init();
105  virtual void reset();
106  virtual ROMol *next();
107  virtual bool atEnd();
108 
109  void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }
110  bool getProcessPropertyLists() const { return df_processPropertyLists; }
111 
112  bool getEOFHitOnRead() const { return df_eofHitOnRead; }
113 
114  protected:
115  virtual void checkForEnd();
116  ROMol *_next();
117  virtual void readMolProps(ROMol *);
118  bool df_end = false;
119  int d_line = 0; // line number we are currently on
120  bool df_sanitize = true, df_removeHs = true, df_strictParsing = true;
121  bool df_processPropertyLists = true;
122  bool df_eofHitOnRead = false;
123 };
124 
125 //! \brief a lazy supplier from an SD file
127  /*************************************************************************
128  * A lazy mol supplier from an SD file.
129  * - When new molecules are read using "next" their positions in the file are
130  *   noted.
131  * - A call to "length" will automatically parse the entire file and
132  *   cache all the mol
133  *   block positions
134  * - [] operator is used to access a molecule at "idx", calling next
135  *   following this will result
136  *   in the next molecule after "idx"
137  ***********************************************************************************/
138 
139  public:
140  SDMolSupplier() { init(); };
141 
142  /*!
143  * \param fileName - the name of the SD file
144  * \param sanitize - if true sanitize the molecule before returning it
145  * \param removeHs - if true remove Hs from the molecule before returning it
146  * (triggers sanitization)
147  * \param strictParsing - if not set, the parser is more lax about
148  * correctness
149  * of the contents.
150  */
151  explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
152  bool removeHs = true, bool strictParsing = true);
153 
154  explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
155  bool sanitize = true, bool removeHs = true,
156  bool strictParsing = true);
157 
159  void init();
160  void reset();
161  ROMol *next();
162  bool atEnd();
163  void moveTo(unsigned int idx);
164  ROMol *operator[](unsigned int idx);
165  /*! \brief returns the text block for a particular item
166  *
167  * \param idx - which item to return
168  */
169  std::string getItemText(unsigned int idx);
170  unsigned int length();
171  void setData(const std::string &text, bool sanitize = true,
172  bool removeHs = true);
173  void setData(const std::string &text, bool sanitize, bool removeHs,
174  bool strictParsing);
175 
176  /*! Resets our internal state and sets the indices of molecules in the stream.
177  * The client should be *very* careful about calling this method, as it's
178  *trivial
179  * to end up with a completely useless supplier.
180  *
181  * \param locs - the vector of stream positions.
182  *
183  * Note that this can be used not only to make reading selected molecules
184  *from a
185  * large SD file much faster, but it can also allow subsetting an SD file or
186  * rearranging the order of the molecules.
187  */
188  void setStreamIndices(const std::vector<std::streampos> &locs);
189 
190  private:
191  void checkForEnd();
192  void setDataCommon(const std::string &text, bool sanitize, bool removeHs);
193  int d_len = 0; // total number of mol blocks in the file (initialized to -1)
194  int d_last = 0; // the molecule we are ready to read
195  std::vector<std::streampos> d_molpos;
196 };
197 
198 //! lazy file parser for Smiles tables
200  /**************************************************************************
201  * Lazy file parser for Smiles table file, similar to the lazy SD
202  * file parser above
203  * - As and when new molecules are read using "next" their
204  * positions in the file are noted.
205  * - A call to the "length" will automatically parse the entire
206  * file and cache all the mol block positions
207  * - [] operator is used to access a molecule at "idx", calling
208  * next following this will result in the next molecule after
209  * "idx"
210  ***************************************************************************/
211  public:
212  /*!
213  * \param fileName - the name of smiles table file
214  * \param delimiter - delimiting characters between records on each
215  * line. NOTE that this is not a string, the tokenizer looks for
216  * the individual characters in delimiter, not the full string
217  * itself. So the default delimiter: " \t", means " " or "\t".
218  * \param smilesColumn - column number for the SMILES string (defaults
219  * to the first column)
220  * \param nameColumn - column number for the molecule name (defaults to
221  * the second column) If set to -1 we assume that no name is
222  * available for the molecule and the name is defaulted to the
223  * smiles string
224  * \param titleLine - if true, the first line is assumed to list the
225  * names of properties in order separated by 'delimiter'. It is
226  * also assumed that the 'SMILES' column and the 'name' column
227  * are not specified here. If false, no title line is assumed
228  * and the properties are recorded as the "columnX" where "X" is
229  * the column number
230  * \param sanitize - if true sanitize the molecule before returning it
231  */
232  explicit SmilesMolSupplier(const std::string &fileName,
233  const std::string &delimiter = " \t",
234  int smilesColumn = 0, int nameColumn = 1,
235  bool titleLine = true, bool sanitize = true);
237  explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
238  const std::string &delimiter = " \t",
239  int smilesColumn = 0, int nameColumn = 1,
240  bool titleLine = true, bool sanitize = true);
241 
243  void setData(const std::string &text, const std::string &delimiter = " ",
244  int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
245  bool sanitize = true);
246  void init();
247  void reset();
248  ROMol *next();
249  bool atEnd();
250  void moveTo(unsigned int idx);
251  ROMol *operator[](unsigned int idx);
252  /*! \brief returns the text block for a particular item
253  *
254  * \param idx - which item to return
255  */
256  std::string getItemText(unsigned int idx);
257  unsigned int length();
258 
259  private:
260  ROMol *processLine(std::string inLine);
261  void processTitleLine();
262  std::string nextLine();
263  long int skipComments();
264  void checkForEnd();
265 
266  bool df_end = false; // have we reached the end of the file?
267  int d_len = 0; // total number of smiles in the file
268  int d_next = 0; // the molecule we are ready to read
269  int d_line = 0; // line number we are currently on
270  std::vector<std::streampos>
271  d_molpos; // vector of positions in the file for molecules
272  std::vector<int> d_lineNums;
273  std::string d_delim; // the delimiter string
274  bool df_sanitize = true; // sanitize molecules before returning them?
275  STR_VECT d_props; // vector of property names
276  bool df_title = true; // do we have a title line?
277  int d_smi = 0; // column id for the smile string
278  int d_name = 1; // column id for the name
279 };
280 
281 //! lazy file parser for TDT files
283  /**************************************************************************
284  * Lazy file parser for TDT files, similar to the lazy SD
285  * file parser above
286  * - As and when new molecules are read using "next" their
287  * positions in the file are noted.
288  * - A call to the "length" will automatically parse the entire
289  * file and cache all the mol block positions
290  * - [] operator is used to access a molecule at "idx", calling
291  * next following this will result in the next molecule after
292  * "idx"
293  ***************************************************************************/
294  public:
295  /*!
296  * \param fileName - the name of the TDT file
297  * \param nameRecord - property name for the molecule name.
298  * If empty (the default), the name defaults to be empty
299  * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
300  * structure (depiction) in the input will be read into the
301  * corresponding conformer id.
302  * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
303  * structure (depiction) in the input will be read into the
304  * corresponding conformer id.
305  * \param sanitize - if true sanitize the molecule before returning it
306  */
307  explicit TDTMolSupplier(const std::string &fileName,
308  const std::string &nameRecord = "", int confId2D = -1,
309  int confId3D = 0, bool sanitize = true);
310  explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
311  const std::string &nameRecord = "", int confId2D = -1,
312  int confId3D = 0, bool sanitize = true);
313  TDTMolSupplier();
314  ~TDTMolSupplier();
315  void setData(const std::string &text, const std::string &nameRecord = "",
316  int confId2D = -1, int confId3D = 0, bool sanitize = true);
317  void init();
318  void reset();
319  ROMol *next();
320  bool atEnd();
321  void moveTo(unsigned int idx);
322  ROMol *operator[](unsigned int idx);
323  /*! \brief returns the text block for a particular item
324  *
325  * \param idx - which item to return
326  */
327  std::string getItemText(unsigned int idx);
328  unsigned int length();
329 
330  private:
331  bool advanceToNextRecord();
332  void checkForEnd();
333  ROMol *parseMol(std::string inLine);
334 
335  bool df_end = false; // have we reached the end of the file?
336  int d_len = 0; // total number of mols in the file
337  int d_last = 0; // the molecule we are ready to read
338  int d_line = 0; // line number we are currently on
339  int d_confId2D = -1; // id to use for 2D conformers
340  int d_confId3D = 0; // id to use for 3D conformers
341  std::vector<std::streampos>
342  d_molpos; // vector of positions in the file for molecules
343  bool df_sanitize = true; // sanitize molecules before returning them?
344  std::string d_nameProp =
345  ""; // local storage for the property providing mol names
346 };
347 
348 //! lazy file parser for PDB files
350  public:
351  explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
352  bool sanitize = true, bool removeHs = true,
353  unsigned int flavor = 0,
354  bool proximityBonding = true);
355  explicit PDBMolSupplier(const std::string &fname, bool sanitize = true,
356  bool removeHs = true, unsigned int flavor = 0,
357  bool proximityBonding = true);
358 
359  virtual ~PDBMolSupplier() {
360  if (df_owner && dp_inStream) delete dp_inStream;
361  };
362 
363  virtual void init();
364  virtual void reset();
365  virtual ROMol *next();
366  virtual bool atEnd();
367 
368  protected:
369  bool df_sanitize, df_removeHs, df_proximityBonding;
370  unsigned int d_flavor;
371 };
372 #ifdef RDK_BUILD_COORDGEN_SUPPORT
373 //! lazy file parser for MAE files
374 class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier {
375  /**
376  * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
377  * always requires taking ownership of the istream ptr, as the shared ptr will
378  * always clear it upon destruction.
379  */
380 
381  public:
382  MaeMolSupplier() { init(); };  // no stream is supplied on this path; only init() is called
383 
384  explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream,  // stream is passed in as a shared_ptr
385  bool sanitize = true, bool removeHs = true);
386 
387  explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,  // NOTE(review): the class comment says ownership is always required - confirm how takeOwnership == false is handled
388  bool sanitize = true, bool removeHs = true);
389 
390  explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,  // fname: presumably the path of the MAE file to read - TODO confirm
391  bool removeHs = true);
392 
393  virtual ~MaeMolSupplier(){
394  // The dp_sInStream shared_ptr will take care of cleaning up.
395  };
396 
397  virtual void init();  // these four declarations match the pure virtual functions declared in MolSupplier
398  virtual void reset();
399  virtual ROMol *next();
400  virtual bool atEnd();
401 
402  protected:
403  bool df_sanitize, df_removeHs;  // NOTE(review): no in-class initializers here (other suppliers default theirs) - confirm the default-constructor/init() path sets these
404  std::shared_ptr<schrodinger::mae::Reader> d_reader;  // maeparser Reader handle
405  std::shared_ptr<schrodinger::mae::Block> d_next_struct;  // presumably the next structure block to hand out - TODO confirm
406  std::shared_ptr<std::istream> dp_sInStream;  // shared_ptr holding the input stream; cleans it up on destruction
407 };
408 #endif // RDK_BUILD_COORDGEN_SUPPORT
409 } // namespace RDKit
410 
411 #endif
RDKit::SDMolSupplier::SDMolSupplier
SDMolSupplier()
Definition: MolSupplier.h:140
RDKit::ForwardSDMolSupplier::ForwardSDMolSupplier
ForwardSDMolSupplier()
Definition: MolSupplier.h:89
RDKit::ForwardSDMolSupplier::getEOFHitOnRead
bool getEOFHitOnRead() const
Definition: MolSupplier.h:112
ROMol.h
Defines the primary molecule class ROMol as well as associated typedefs.
types.h
RDKit::SDMolSupplier::~SDMolSupplier
~SDMolSupplier()
Definition: MolSupplier.h:158
RDKit::SDMolSupplier
Definition: MolSupplier.h:126
RDKit::ForwardSDMolSupplier::setProcessPropertyLists
void setProcessPropertyLists(bool val)
Definition: MolSupplier.h:109
RDKIT_FILEPARSERS_EXPORT
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:216
RDKit::MolSupplier
Definition: MolSupplier.h:56
RDKit::strip
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
RDKit::ForwardSDMolSupplier
Definition: MolSupplier.h:82
RDKit::STR_VECT
std::vector< std::string > STR_VECT
Definition: Dict.h:29
RDKit::PDBMolSupplier::d_flavor
unsigned int d_flavor
Definition: MolSupplier.h:370
RDKit::MolSupplier::MolSupplier
MolSupplier()
Definition: MolSupplier.h:59
RDKit::ForwardSDMolSupplier::getProcessPropertyLists
bool getProcessPropertyLists() const
Definition: MolSupplier.h:110
RDKit::PDBMolSupplier::~PDBMolSupplier
virtual ~PDBMolSupplier()
Definition: MolSupplier.h:359
RDKit::ROMol
Definition: ROMol.h:171
RDKit::ForwardSDMolSupplier::~ForwardSDMolSupplier
virtual ~ForwardSDMolSupplier()
Definition: MolSupplier.h:96
RDKit::MolSupplier::~MolSupplier
virtual ~MolSupplier()
Definition: MolSupplier.h:60
RDKit::PDBMolSupplier::df_sanitize
bool df_sanitize
Definition: MolSupplier.h:369
RDKit::TDTMolSupplier
lazy file parser for TDT files
Definition: MolSupplier.h:282
RDKit
Std stuff.
Definition: Atom.h:30
RDKit::SmilesMolSupplier
lazy file parser for Smiles tables
Definition: MolSupplier.h:199
RDKit::PDBMolSupplier
lazy file parser for PDB files
Definition: MolSupplier.h:349
RDKit::MolOps::removeHs
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
export.h