RDKit
Open-source cheminformatics and machine learning.
FPBReader.h
Go to the documentation of this file.
1 //
2 // Copyright (c) 2016 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_FPBREADER_H_DEC2015
12 #define RD_FPBREADER_H_DEC2015
13 /*! \file FPBReader.h
14 
15  \brief contains a simple class for reading and searching FPB files
16 
17  \b Note that this functionality is experimental and the API may change
18  in future releases.
19 */
20 
21 #include <iostream>
22 #include <fstream>
23 #include <sstream>
24 #include <string>
27 
28 #include <cstdint>
29 #include <boost/shared_ptr.hpp>
30 #include <boost/shared_array.hpp>
31 
32 namespace RDKit {
33 namespace detail {
34 struct FPBReader_impl;
35 }
36 
37 //! class for reading and searching FPB files
38 /*!
39  basic usage:
40  \code
41  FPBReader reader("foo.fpb");
42  reader.init();
43  boost::shared_ptr<ExplicitBitVect> ebv = reader.getFP(95);
44  std::vector<std::pair<double, unsigned int> > nbrs =
45  reader.getTanimotoNeighbors(*ebv.get(), 0.70);
46  \endcode
47 
48  \b Note: this functionality is experimental and the API may change
49  in future releases.
50 
51  <b>Note on thread safety</b>
52  Operations that involve reading from the FPB file are not thread safe.
53  This means that the \c init() method is not thread safe and none of the
54  search operations are thread safe when an \c FPBReader is initialized in
55  \c lazyRead mode.
56 
57 */
59  public:
61  : dp_istrm(NULL),
62  dp_impl(NULL),
63  df_owner(false),
64  df_init(false),
65  df_lazyRead(false){};
66  //! ctor for reading from a named file
67  /*!
68  \param fname the name of the file to read (a \c BadFileException is thrown if it cannot be opened)
69  \param lazyRead if set to \c false all fingerprints from the file will be read
70  into memory when \c init() is called.
71  */
72  FPBReader(const char *fname, bool lazyRead = false) {
73  _initFromFilename(fname, lazyRead);  // opens the file in binary mode and sets every data member
74  };
75  //! \overload
76  FPBReader(const std::string &fname, bool lazyRead = false) {
77  _initFromFilename(fname.c_str(), lazyRead);  // same setup as the const char * ctor; throws BadFileException on a bad file
78  };
79  //! ctor for reading from an open istream
80  /*!
81  \param inStream the stream to read from
82  \param takeOwnership if set, we will take over ownership of the stream pointer (it is deleted when the reader is destroyed)
83  \param lazyRead if set to \c false all fingerprints from the file will be read
84  into memory when \c init() is called.
85 
86  Some additional notes:
87  - if \c lazyRead is set, \c inStream must support the \c seekg() and \c
88  tellg() operations.
89 
90  */
91  FPBReader(std::istream *inStream, bool takeOwnership = true,
92  bool lazyRead = false)
93  : dp_istrm(inStream),  // the pointer is stored as-is; the stream is not copied
94  dp_impl(NULL),
95  df_owner(takeOwnership),
96  df_init(false),  // the ctor body is empty, so nothing is read here; see init()
97  df_lazyRead(lazyRead){};
99  destroy();
100  if (df_owner) delete dp_istrm;
101  dp_istrm = NULL;
102  df_init = false;
103  };
104 
105  //! Read the data from the file and initialize internal data structures
106  /*!
107  This must be called before most of the other methods of this class.
108 
109  Some notes:
110  \li if \c lazyRead is not set, all fingerprints will be read into memory. This
111  can require substantial amounts of memory for large files.
112  \li For large files, this can take a long time.
113  \li If \c lazyRead and \c takeOwnership are both \c false it is safe to close
114  and delete inStream after calling \c init()
115  */
116  void init();
117  //! cleanup
118  /*!
119  Cleans up whatever memory was allocated during init(). Calling this on a reader that is not currently initialized does nothing.
120  */
121  void cleanup() {
122  if (!df_init) return;  // not initialized: nothing to release
123  destroy();
124  df_init = false;  // mark the reader as uninitialized again
125  };
126  //! returns the requested fingerprint as an \c ExplicitBitVect
127  boost::shared_ptr<ExplicitBitVect> getFP(unsigned int idx) const;
128  //! returns the requested fingerprint as an array of bytes
129  boost::shared_array<std::uint8_t> getBytes(unsigned int idx) const;
130 
131  //! returns the id of the requested fingerprint
132  std::string getId(unsigned int idx) const;
133  //! returns the fingerprint and id of the requested fingerprint as a (getFP(idx), getId(idx)) pair
134  std::pair<boost::shared_ptr<ExplicitBitVect>, std::string> operator[](
135  unsigned int idx) const {
136  return std::make_pair(getFP(idx), getId(idx));  // two separate lookups with the same index
137  };
138 
139  //! returns beginning and end indices of fingerprints having on-bit counts
140  //! within the range (including end points)
141  std::pair<unsigned int, unsigned int> getFPIdsInCountRange(
142  unsigned int minCount, unsigned int maxCount);
143 
144  //! returns the number of fingerprints
145  unsigned int length() const;
146  //! returns the number of bits in our fingerprints
147  unsigned int nBits() const;
148 
149  //! returns the tanimoto similarity between the specified fingerprint and the
150  //! provided fingerprint
151  double getTanimoto(unsigned int idx, const std::uint8_t *bv) const;
152  //! \overload
153  double getTanimoto(unsigned int idx,
154  boost::shared_array<std::uint8_t> bv) const {
155  return getTanimoto(idx, bv.get());  // forwards the raw byte pointer to the const std::uint8_t * version
156  };
157  //! \overload
158  double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const;
159 
160  //! returns tanimoto neighbors that are within a similarity threshold
161  /*!
162  The result vector of (similarity,index) pairs is sorted in order
163  of decreasing similarity
164 
165  \param bv the query fingerprint
166  \param threshold the minimum similarity to return
167  \param usePopcountScreen if this is true (the default) the popcount of the
168  neighbors will be used to reduce the number of calculations that need
169  to be done
170 
171  */
172  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
173  const std::uint8_t *bv, double threshold = 0.7,
174  bool usePopcountScreen = true) const;
175  //! \overload
176  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
177  boost::shared_array<std::uint8_t> bv, double threshold = 0.7,
178  bool usePopcountScreen = true) const {
179  return getTanimotoNeighbors(bv.get(), threshold, usePopcountScreen);  // forwards to the const std::uint8_t * version
180  };
181  //! \overload
182  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
183  const ExplicitBitVect &ebv, double threshold = 0.7,
184  bool usePopcountScreen = true) const;
185 
186  //! returns the Tversky similarity between the specified fingerprint and the
187  //! provided fingerprint
188  /*!
189 
190  \param idx the fingerprint to compare to
191  \param bv the query fingerprint
192  \param ca the Tversky a coefficient
193  \param cb the Tversky b coefficient
194 
195  */
196  double getTversky(unsigned int idx, const std::uint8_t *bv, double ca,
197  double cb) const;
198  //! \overload
199  double getTversky(unsigned int idx, boost::shared_array<std::uint8_t> bv,
200  double ca, double cb) const {
201  return getTversky(idx, bv.get(), ca, cb);  // forwards the raw byte pointer to the const std::uint8_t * version
202  };
203  //! \overload
204  double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca,
205  double cb) const;
206 
207  //! returns Tversky neighbors that are within a similarity threshold
208  /*!
209  The result vector of (similarity,index) pairs is sorted in order
210  of decreasing similarity
211 
212  \param bv the query fingerprint
213  \param ca the Tversky a coefficient
214  \param cb the Tversky b coefficient
215  \param threshold the minimum similarity to return
216  \param usePopcountScreen if this is true (the default) the popcount of the
217  neighbors will be used to reduce the number of calculations that need
218  to be done
219 
220  */
221  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
222  const std::uint8_t *bv, double ca, double cb, double threshold = 0.7,
223  bool usePopcountScreen = true) const;
224  //! \overload
225  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
226  boost::shared_array<std::uint8_t> bv, double ca, double cb,
227  double threshold = 0.7, bool usePopcountScreen = true) const {
228  return getTverskyNeighbors(bv.get(), ca, cb, threshold, usePopcountScreen);  // forwards to the const std::uint8_t * version
229  };
230  //! \overload
231  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
232  const ExplicitBitVect &ebv, double ca, double cb, double threshold = 0.7,
233  bool usePopcountScreen = true) const;
234 
235  //! returns indices of all fingerprints that completely contain this one
236  /*! (i.e. where all the bits set in the query are also set in the db
237  molecule)
238  */
239  std::vector<unsigned int> getContainingNeighbors(
240  const std::uint8_t *bv) const;
241  //! \overload
242  std::vector<unsigned int> getContainingNeighbors(
243  boost::shared_array<std::uint8_t> bv) const {
244  return getContainingNeighbors(bv.get());  // forwards the raw byte pointer to the const std::uint8_t * version
245  };
246  //! \overload
247  std::vector<unsigned int> getContainingNeighbors(
248  const ExplicitBitVect &ebv) const;
249 
250  private:
251  std::istream *dp_istrm;
252  detail::FPBReader_impl *dp_impl; // implementation details
253  bool df_owner;
254  bool df_init;
255  bool df_lazyRead;
256 
257  // disable automatic copy constructors and assignment operators
258  // for this class and its subclasses. They will likely be
259  // carrying around stream pointers and copying those is a recipe
260  // for disaster.
261  FPBReader(const FPBReader &);
262  FPBReader &operator=(const FPBReader &);
263  void destroy();
264  void _initFromFilename(const char *fname, bool lazyRead) {  // shared setup for the two filename ctors: opens fname and sets every data member
265  std::istream *tmpStream = static_cast<std::istream *>(
266  new std::ifstream(fname, std::ios_base::binary));  // the file is opened in binary mode
267  if (!(*tmpStream) || (tmpStream->bad())) {  // the stream is not usable after opening
268  std::ostringstream errout;
269  errout << "Bad input file " << fname;
270  delete tmpStream;  // free the stream before throwing so it is not leaked
271  throw BadFileException(errout.str());
272  }
273  dp_istrm = tmpStream;
274  dp_impl = NULL;  // the implementation pointer starts out null
275  df_owner = true;  // the stream was allocated above, so this object owns it
276  df_init = false;  // nothing is read from the file here
277  df_lazyRead = lazyRead;
278  }
279 };
280 } // namespace RDKit
281 #endif
BadFileException.h
RDKit::FPBReader::getTanimoto
double getTanimoto(unsigned int idx, boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: FPBReader.h:153
RDKit::FPBReader::operator[]
std::pair< boost::shared_ptr< ExplicitBitVect >, std::string > operator[](unsigned int idx) const
returns the fingerprint and id of the requested fingerprint
Definition: FPBReader.h:134
RDKit::FPBReader::getContainingNeighbors
std::vector< unsigned int > getContainingNeighbors(boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: FPBReader.h:242
RDKit::FPBReader::getTanimotoNeighbors
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(boost::shared_array< std::uint8_t > bv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: FPBReader.h:176
RDKit::FPBReader::~FPBReader
~FPBReader()
Definition: FPBReader.h:98
RDKit::FPBReader::FPBReader
FPBReader(std::istream *inStream, bool takeOwnership=true, bool lazyRead=false)
ctor for reading from an open istream
Definition: FPBReader.h:91
ExplicitBitVect.h
RDKIT_DATASTRUCTS_EXPORT
#define RDKIT_DATASTRUCTS_EXPORT
Definition: export.h:112
RDKit::FPBReader::cleanup
void cleanup()
cleanup
Definition: FPBReader.h:121
RDKit::FPBReader::FPBReader
FPBReader()
Definition: FPBReader.h:60
RDKit::FPBReader::getTverskyNeighbors
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(boost::shared_array< std::uint8_t > bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: FPBReader.h:225
RDKit
Std stuff.
Definition: Atom.h:30
RDKit::getFP
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFP(const ROMol &mol, FPType fPType)
RDKit::FPBReader::FPBReader
FPBReader(const char *fname, bool lazyRead=false)
ctor for reading from a named file
Definition: FPBReader.h:72
RDKit::FPBReader::FPBReader
FPBReader(const std::string &fname, bool lazyRead=false)
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: FPBReader.h:76
RDKit::BadFileException
used by various file parsing classes to indicate a bad file
Definition: BadFileException.h:21
RDKit::FPBReader::getTversky
double getTversky(unsigned int idx, boost::shared_array< std::uint8_t > bv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: FPBReader.h:199
RDKit::FPBReader
class for reading and searching FPB files
Definition: FPBReader.h:58
ExplicitBitVect
a class for bit vectors that are densely occupied
Definition: ExplicitBitVect.h:29
export.h