RDKit
Open-source cheminformatics and machine learning.
FingerprintGenerator.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2018 Boran Adas, Google Summer of Code
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #include <RDGeneral/export.h>
12 #ifndef RD_FINGERPRINTGEN_H_2018_05
13 #define RD_FINGERPRINTGEN_H_2018_05
14 
18 #include <cstdint>
19 
20 namespace RDKit {
21 class ROMol;
22 
24  // will review this structure once more fingerprint types are implemented
25 
26  std::vector<std::vector<std::uint64_t>> *atomToBits;
27 
28  std::map<std::uint32_t, std::vector<std::pair<std::uint32_t, std::uint32_t>>>
30  // morgan fp
31  // maps bitId -> vector of (atomId, radius)
32 
33  std::pair<std::vector<std::vector<std::uint32_t>>,
34  std::map<std::uint32_t, std::vector<std::vector<int>>>> *bitInfo;
35  // rdkit fp
36  // first part, vector of bits set for each atom, must have the same size as
37  // atom count for molecule
38  // second part, maps bitId -> vector of paths
39 
40  std::vector<unsigned int> *atomCounts;
41  // number of paths that set bits for each atom, must have the same size as
42  // atom count for molecule
43 };
44 
45 /*!
46  \brief Abstract base class that holds the molecule-independent arguments
47  common to all fingerprint types; classes derived from it hold the
48  fingerprint-type-specific arguments
49 
50  */
51 template <typename OutputType>
53  : private boost::noncopyable {
54  public:
55  FingerprintArguments(bool countSimulation,
56  const std::vector<std::uint32_t> countBounds,
57  std::uint32_t fpSize,
58  std::uint32_t numBitsPerFeature = 1);
59  const bool d_countSimulation;
60  const std::vector<std::uint32_t> d_countBounds;
61  const std::uint32_t d_fpSize;
62  const std::uint32_t d_numBitsPerFeature;
63 
64  /*!
65  \brief Returns the size of the fingerprint based on arguments
66 
67  \return OutputType size of the fingerprint
68  */
69  virtual OutputType getResultSize() const = 0;
70 
71  /**
72  \brief method that returns information string about the fingerprint specific
73  argument set and the arguments themselves
74 
75  \return std::string information string
76  */
77  virtual std::string infoString() const = 0;
78 
79  /**
80  \brief method that returns information string about common fingerprinting
81  arguments' values
82 
83  \return std::string information string
84  */
85  std::string commonArgumentsString() const;
86 
87  virtual ~FingerprintArguments(){};
88 };
89 
90 /*!
91  \brief abstract base class that holds atom-environments that will be hashed to
92  generate the fingerprint
93 
94  */
95 template <typename OutputType>
96 class RDKIT_FINGERPRINTS_EXPORT AtomEnvironment : private boost::noncopyable {
97  public:
98  /*!
99  \brief calculates and returns the bit id to be set for this atom-environment
100 
      Pure virtual: every concrete atom-environment type must supply its own
      implementation. The class derives privately from boost::noncopyable, so
      atom-environments cannot be copied.

101  \param arguments Fingerprinting type specific molecule independent
102  arguments
103  \param atomInvariants Atom-invariants to be used during hashing
104  \param bondInvariants Bond-invariants to be used during hashing
105  \param hashResults if set results will be ready to be modded
      (defaults to false)
106 
107  \return OutputType calculated bit id for this environment
108  */
109  virtual OutputType getBitId(FingerprintArguments<OutputType> *arguments,
110  const std::vector<std::uint32_t> *atomInvariants,
111  const std::vector<std::uint32_t> *bondInvariants,
 // NOTE(review): the listing numbering jumps from 111 to 113 here, so a
 // parameter line may be missing between bondInvariants and hashResults —
 // confirm against the upstream header before relying on this signature.
113  const bool hashResults = false) const = 0;
114 
 // Virtual destructor with an empty body, so derived environments are
 // destroyed correctly when deleted through an AtomEnvironment pointer.
115  virtual ~AtomEnvironment(){};
116 };
117 
118 /*!
119  \brief abstract base class that generates atom-environments from a molecule
120 
121  */
122 template <typename OutputType>
124  : private boost::noncopyable {
125  public:
126  /*!
127  \brief generate and return all atom-environments from a molecule
128 
129  \param mol molecule to generate the atom-environments from
130  \param arguments fingerprint type specific molecule independent
131  arguments
132  \param fromAtoms atoms to be used during environment generation,
133  usage of this parameter depends on the implementation of different
134  fingerprint types
135  \param ignoreAtoms atoms to be ignored during environment generation,
136  usage of this parameter depends on the implementation of different
137  fingerprint types
138  \param confId which conformation to use during environment
139  generation, needed for some fingerprint types
140  \param additionalOutput contains pointers for additional outputs of
141  fingerprinting operation, usage depends on implementation of the fingerprint
142  type
143  \param atomInvariants atom invariants to be used during environment
144  generation, in some cases some of the hashing can be done during environment
145  generation so it is also passed here
146  \param bondInvariants bond invariants to be used during environment
147  generation, same as atomInvariants it might be needed
148  \param hashResults if set results will be ready to be modded
149 
150  \return std::vector<AtomEnvironment *> atom-environments generated from
151  this molecule
152  */
153  virtual std::vector<AtomEnvironment<OutputType> *> getEnvironments(
154  const ROMol &mol, FingerprintArguments<OutputType> *arguments,
155  const std::vector<std::uint32_t> *fromAtoms = nullptr,
156  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
157  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
158  const std::vector<std::uint32_t> *atomInvariants = nullptr,
159  const std::vector<std::uint32_t> *bondInvariants = nullptr,
160  const bool hashResults = false) const = 0;
161 
162  /**
163  \brief method that returns information about this \c AtomEnvironmentGenerator
164  and its arguments if any
165 
166  \return std::string information string
167  */
168  virtual std::string infoString() const = 0;
169 
171 };
172 
173 /*!
174  \brief abstract base class for atom invariants generators
175 
176  */
178  : private boost::noncopyable {
179  public:
180  /*!
181  \brief get atom invariants from a molecule
182 
183  \param mol molecule to generate the atom invariants for
184 
185  \return std::vector<std::uint32_t> atom invariants generated for the given
186  molecule
187  */
188  virtual std::vector<std::uint32_t> *getAtomInvariants(
189  const ROMol &mol) const = 0;
190 
191  /**
192  \brief method that returns information about this \c AtomInvariantsGenerator
193  and its arguments
194 
195  \return std::string information string
196  */
197  virtual std::string infoString() const = 0;
198 
200  virtual AtomInvariantsGenerator *clone() const = 0;
201 };
202 
203 /*!
204  \brief abstract base class for bond invariants generators
205 
206  */
208  : private boost::noncopyable {
209  public:
210  /*!
211  \brief get bond invariants from a molecule
212 
213  \param mol molecule to generate the bond invariants for
214 
215  \return std::vector<std::uint32_t> bond invariants generated for the given
216  molecule
217  */
218  virtual std::vector<std::uint32_t> *getBondInvariants(
219  const ROMol &mol) const = 0;
220 
221  /**
222  \brief method that returns information about this \c BondInvariantsGenerator
223  and its arguments
224 
225  \return std::string information string
226  */
227  virtual std::string infoString() const = 0;
228 
230  virtual BondInvariantsGenerator *clone() const = 0;
231 }; // class BondInvariantsGenerator
232 
233 /*!
234  \brief class that generates same fingerprint style for different output
235  formats
236 
237  */
238 template <typename OutputType>
240  : private boost::noncopyable {
241  FingerprintArguments<OutputType> *dp_fingerprintArguments;
242  AtomEnvironmentGenerator<OutputType> *dp_atomEnvironmentGenerator;
243  AtomInvariantsGenerator *dp_atomInvariantsGenerator;
244  BondInvariantsGenerator *dp_bondInvariantsGenerator;
245  const bool df_ownsAtomInvGenerator;
246  const bool df_ownsBondInvGenerator;
247 
248  SparseIntVect<OutputType> *getFingerprintHelper(
249  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
250  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
251  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
252  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
253  const std::vector<std::uint32_t> *customBondInvariants = nullptr,
254  const std::uint64_t fpSize = 0) const;
255 
256  public:
258  AtomEnvironmentGenerator<OutputType> *atomEnvironmentGenerator,
259  FingerprintArguments<OutputType> *fingerprintArguments,
260  AtomInvariantsGenerator *atomInvariantsGenerator = nullptr,
261  BondInvariantsGenerator *bondInvariantsGenerator = nullptr,
262  bool ownsAtomInvGenerator = false, bool ownsBondInvGenerator = false);
263 
265 
266  SparseIntVect<OutputType> *getSparseCountFingerprint(
267  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
268  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
269  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
270  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
271  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
272 
273  SparseBitVect *getSparseFingerprint(
274  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
275  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
276  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
277  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
278  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
279 
280  SparseIntVect<std::uint32_t> *getCountFingerprint(
281  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
282  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
283  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
284  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
285  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
286 
288  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
289  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
290  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
291  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
292  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
293 
294  std::string infoString() const;
295 };
296 
298 
299 //! used to indicate errors for unimplemented fp types in convenience functions
301  : public std::exception {
302  public:
303  //! construct with an error message
304  UnimplementedFPException(const char *msg) : _msg(msg){};
305  //! construct with an error message
306  UnimplementedFPException(const std::string &msg) : _msg(msg){};
307  //! get the error message
308  const char *message() const { return _msg.c_str(); };
310 
311  private:
312  std::string _msg;
313 };
314 
315 // convenience functions, fingerprint generation with default values
316 
317 RDKIT_FINGERPRINTS_EXPORT SparseIntVect<std::uint64_t> *getSparseCountFP(
318  const ROMol &mol, FPType fPType);
319 
321  FPType fPType);
322 
323 RDKIT_FINGERPRINTS_EXPORT SparseIntVect<std::uint32_t> *getCountFP(
324  const ROMol &mol, FPType fPType);
325 
327  FPType fPType);
328 
// Bulk convenience functions: each takes a vector of molecule pointers and a
// fingerprint type, and returns a pointer to a vector of fingerprint pointers
// (presumably one entry per input molecule — TODO confirm).
// molVector is declared by value, so the vector of pointers is copied on each
// call; the molecules themselves are not copied.
// NOTE(review): ownership of the returned vector and of the fingerprints it
// holds is not stated in this header — presumably the caller must free both;
// confirm against the implementation.
329 RDKIT_FINGERPRINTS_EXPORT std::vector<SparseIntVect<std::uint64_t> *> *
330 getSparseCountFPBulk(const std::vector<const ROMol *> molVector, FPType fPType);
331 
332 RDKIT_FINGERPRINTS_EXPORT std::vector<SparseBitVect *> *getSparseFPBulk(
333  const std::vector<const ROMol *> molVector, FPType fPType);
334 
335 RDKIT_FINGERPRINTS_EXPORT std::vector<SparseIntVect<std::uint32_t> *>
336  *getCountFPBulk(const std::vector<const ROMol *> molVector, FPType fPType);
337 
338 RDKIT_FINGERPRINTS_EXPORT std::vector<ExplicitBitVect *> *getFPBulk(
339  const std::vector<const ROMol *> molVector, FPType fPType);
340 
341 } // namespace RDKit
342 
343 #endif
RDKit::FPType
FPType
Definition: FingerprintGenerator.h:297
RDKit::FingerprintGenerator
class that generates same fingerprint style for different output formats
Definition: FingerprintGenerator.h:239
RDKit::FingerprintArguments::d_fpSize
const std::uint32_t d_fpSize
Definition: FingerprintGenerator.h:61
RDKit::UnimplementedFPException::~UnimplementedFPException
~UnimplementedFPException()
Definition: FingerprintGenerator.h:309
RDKit::FingerprintArguments::~FingerprintArguments
virtual ~FingerprintArguments()
Definition: FingerprintGenerator.h:87
RDKit::UnimplementedFPException::UnimplementedFPException
UnimplementedFPException(const char *msg)
construct with an error message
Definition: FingerprintGenerator.h:304
RDKit::AdditionalOutput::bitInfoMap
std::map< std::uint32_t, std::vector< std::pair< std::uint32_t, std::uint32_t > > > * bitInfoMap
Definition: FingerprintGenerator.h:29
RDKit::FPType::TopologicalTorsionFP
@ TopologicalTorsionFP
RDKit::AdditionalOutput
Definition: FingerprintGenerator.h:23
RDKit::AtomEnvironmentGenerator::~AtomEnvironmentGenerator
virtual ~AtomEnvironmentGenerator()
Definition: FingerprintGenerator.h:170
RDKit::getSparseFPBulk
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseBitVect * > * getSparseFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKit::getSparseFP
RDKIT_FINGERPRINTS_EXPORT SparseBitVect * getSparseFP(const ROMol &mol, FPType fPType)
RDKit::getSparseCountFP
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< std::uint64_t > * getSparseCountFP(const ROMol &mol, FPType fPType)
RDKit::AdditionalOutput::atomToBits
std::vector< std::vector< std::uint64_t > > * atomToBits
Definition: FingerprintGenerator.h:26
ExplicitBitVect.h
RDKit::ROMol
Definition: ROMol.h:171
RDKit::AtomEnvironment::~AtomEnvironment
virtual ~AtomEnvironment()
Definition: FingerprintGenerator.h:115
RDKit::UnimplementedFPException::message
const char * message() const
get the error message
Definition: FingerprintGenerator.h:308
RDKit::AtomInvariantsGenerator
abstract base class for atom invariants generators
Definition: FingerprintGenerator.h:177
RDKit::MorganFingerprints::getFingerprint
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< std::uint32_t > * getFingerprint(const ROMol &mol, unsigned int radius, std::vector< std::uint32_t > *invariants=0, const std::vector< std::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool useCounts=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule
RDKit::BondInvariantsGenerator
abstract base class for bond invariants generators
Definition: FingerprintGenerator.h:207
RDKit::AdditionalOutput::bitInfo
std::pair< std::vector< std::vector< std::uint32_t > >, std::map< std::uint32_t, std::vector< std::vector< int > > > > * bitInfo
Definition: FingerprintGenerator.h:34
RDKit::AdditionalOutput::atomCounts
std::vector< unsigned int > * atomCounts
Definition: FingerprintGenerator.h:40
RDKit::getCountFPBulk
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseIntVect< std::uint32_t > * > * getCountFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKit::AtomEnvironment
abstract base class that holds atom-environments that will be hashed to generate the fingerprint
Definition: FingerprintGenerator.h:96
RDKit::getFPBulk
RDKIT_FINGERPRINTS_EXPORT std::vector< ExplicitBitVect * > * getFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKit
Std stuff.
Definition: Atom.h:30
RDKit::AtomEnvironmentGenerator
abstract base class that generates atom-environments from a molecule
Definition: FingerprintGenerator.h:123
RDKit::getFP
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFP(const ROMol &mol, FPType fPType)
RDKit::UnimplementedFPException::UnimplementedFPException
UnimplementedFPException(const std::string &msg)
construct with an error message
Definition: FingerprintGenerator.h:306
RDKit::FingerprintArguments::d_numBitsPerFeature
const std::uint32_t d_numBitsPerFeature
Definition: FingerprintGenerator.h:62
SparseBitVect
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:34
SparseIntVect.h
RDKit::FPType::MorganFP
@ MorganFP
RDKit::UnimplementedFPException
used to indicate errors for unimplemented fp types in convenience functions
Definition: FingerprintGenerator.h:300
RDKIT_FINGERPRINTS_EXPORT
#define RDKIT_FINGERPRINTS_EXPORT
Definition: export.h:242
RDKit::BondInvariantsGenerator::~BondInvariantsGenerator
virtual ~BondInvariantsGenerator()
Definition: FingerprintGenerator.h:229
RDKit::FPType::AtomPairFP
@ AtomPairFP
RDKit::FPType::RDKitFP
@ RDKitFP
RDKit::getCountFP
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< std::uint32_t > * getCountFP(const ROMol &mol, FPType fPType)
RDKit::FingerprintArguments::d_countBounds
const std::vector< std::uint32_t > d_countBounds
Definition: FingerprintGenerator.h:60
RDKit::SparseIntVect
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:28
RDKit::AtomInvariantsGenerator::~AtomInvariantsGenerator
virtual ~AtomInvariantsGenerator()
Definition: FingerprintGenerator.h:199
RDKit::FingerprintArguments
Abstract base class that holds molecule independent arguments that are common amongst all fingerprint...
Definition: FingerprintGenerator.h:52
RDKit::FingerprintArguments::d_countSimulation
const bool d_countSimulation
Definition: FingerprintGenerator.h:59
RDKit::getSparseCountFPBulk
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseIntVect< std::uint64_t > * > * getSparseCountFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
SparseBitVect.h
ExplicitBitVect
a class for bit vectors that are densely occupied
Definition: ExplicitBitVect.h:29
export.h