RDKit
Open-source cheminformatics and machine learning.
BitOps.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef __RD_BITOPS_H__
12 #define __RD_BITOPS_H__
13 /*! \file BitOps.h
14 
15  \brief Contains general bit-comparison and similarity operations.
16 
17  The notation used to document the similarity metrics is:
18  - \c V1_n: number of bits in vector 1
19  - \c V1_o: number of on bits in vector 1
20  - <tt>(V1&V2)_o</tt>: number of on bits in the intersection of vectors 1 and
21  2
22 
23  */
24 
25 #include "BitVects.h"
26 #include <string>
27 
//! general purpose wrapper for calculating the similarity between two bvs
//! that may be of unequal size (will automatically fold as appropriate)
/*!
  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param metric         the similarity function applied to the two vectors
                        after the longer one (if any) has been folded
  \param returnDistance if true, <tt>1 - metric(...)</tt> is returned instead
                        of the similarity itself

  \return the value computed by \c metric, or one minus that value when
          \c returnDistance is set

  If the two vectors report different values from \c getNumBits(), the longer
  one is passed through FoldFingerprint() with a factor equal to the quotient
  of the two lengths, and \c metric is called on the folded copy. The folded
  copy is released on every exit path, including when \c metric throws.

  <b>Note (review):</b> the fold factor is the plain quotient of the two
  \c getNumBits() values. This presumably assumes that the longer length is an
  exact multiple of the shorter one and that the shorter one is non-zero;
  neither is checked here - confirm that callers guarantee both.
*/
template <typename T>
double SimilarityWrapper(const T& bv1, const T& bv2,
                         double (*metric)(const T&, const T&),
                         bool returnDistance = false) {
  // Owns the folded temporary (when one is created) so that it is deleted
  // no matter how this function is left.
  struct FoldedHolder {
    T* ptr;
    FoldedHolder() : ptr(0) {}
    ~FoldedHolder() { delete ptr; }

   private:
    // not copyable: a copy would delete the same pointer twice
    FoldedHolder(const FoldedHolder&);
    FoldedHolder& operator=(const FoldedHolder&);
  } folded;

  double similarity = 0.0;
  if (bv1.getNumBits() > bv2.getNumBits()) {
    // bv1 is longer: fold it down towards the length of bv2
    folded.ptr = FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits());
    similarity = metric(*folded.ptr, bv2);
  } else if (bv2.getNumBits() > bv1.getNumBits()) {
    // bv2 is longer: fold it down towards the length of bv1
    folded.ptr = FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits());
    similarity = metric(bv1, *folded.ptr);
  } else {
    // equal lengths: nothing to fold
    similarity = metric(bv1, bv2);
  }
  return returnDistance ? 1.0 - similarity : similarity;
}
//! \overload
/*!
  Variant for metrics that take two additional weighting parameters.

  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param a              first weight, forwarded unchanged to \c metric
  \param b              second weight, forwarded unchanged to \c metric
  \param metric         the similarity function applied to the two vectors
                        after the longer one (if any) has been folded
  \param returnDistance if true, <tt>1 - metric(...)</tt> is returned instead
                        of the similarity itself

  \return the value computed by \c metric, or one minus that value when
          \c returnDistance is set

  If the two vectors report different values from \c getNumBits(), the longer
  one is passed through FoldFingerprint() with a factor equal to the quotient
  of the two lengths, and \c metric is called on the folded copy. The folded
  copy is released on every exit path, including when \c metric throws.

  <b>Note (review):</b> the fold factor is the plain quotient of the two
  \c getNumBits() values. This presumably assumes that the longer length is an
  exact multiple of the shorter one and that the shorter one is non-zero;
  neither is checked here - confirm that callers guarantee both.
*/
template <typename T>
double SimilarityWrapper(const T& bv1, const T& bv2, double a, double b,
                         double (*metric)(const T&, const T&, double, double),
                         bool returnDistance = false) {
  // Owns the folded temporary (when one is created) so that it is deleted
  // no matter how this function is left.
  struct FoldedHolder {
    T* ptr;
    FoldedHolder() : ptr(0) {}
    ~FoldedHolder() { delete ptr; }

   private:
    // not copyable: a copy would delete the same pointer twice
    FoldedHolder(const FoldedHolder&);
    FoldedHolder& operator=(const FoldedHolder&);
  } folded;

  double similarity = 0.0;
  if (bv1.getNumBits() > bv2.getNumBits()) {
    // bv1 is longer: fold it down towards the length of bv2
    folded.ptr = FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits());
    similarity = metric(*folded.ptr, bv2, a, b);
  } else if (bv2.getNumBits() > bv1.getNumBits()) {
    // bv2 is longer: fold it down towards the length of bv1
    folded.ptr = FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits());
    similarity = metric(bv1, *folded.ptr, a, b);
  } else {
    // equal lengths: nothing to fold
    similarity = metric(bv1, bv2, a, b);
  }
  return returnDistance ? 1.0 - similarity : similarity;
}
69 
70 RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char* probe,
71  const char* ref);
72 RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const std::string& probe,
73  const std::string& ref);
75  const ExplicitBitVect& ref);
76 
77 template <typename T1>
78 RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const T1& probe,
79  const std::string& pkl);
80 
81 template <typename T1>
82 RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const T1& probe, const T1& ref);
83 
84 //! returns the number of on bits in common between two bit vectors
85 /*!
86  \return (bv1&bv2)_o
87 */
88 template <typename T1, typename T2>
89 RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(const T1& bv1, const T2& bv2);
90 
92  const ExplicitBitVect& bv2);
93 
94 //! returns the Tanimoto similarity between two bit vects
95 /*!
96  \return <tt>(bv1&bv2)_o / [bv1_o + bv2_o - (bv1&bv2)_o]</tt>
97 */
98 template <typename T1, typename T2>
99 RDKIT_DATASTRUCTS_EXPORT double TanimotoSimilarity(const T1& bv1,
100  const T2& bv2);
101 
102 //! returns the Cosine similarity between two bit vects
103 /*!
104  \return <tt>(bv1&bv2)_o / sqrt(bv1_o * bv2_o)</tt>
105 */
106 template <typename T1, typename T2>
107 RDKIT_DATASTRUCTS_EXPORT double CosineSimilarity(const T1& bv1, const T2& bv2);
108 
109 //! returns the Kulczynski similarity between two bit vects
110 /*!
111  \return <tt>(bv1&bv2)_o * [bv1_o + bv2_o] / [2 * bv1_o * bv2_o]</tt>
112 */
113 template <typename T1, typename T2>
115  const T2& bv2);
116 
117 //! returns the Dice similarity between two bit vects
118 /*!
119  \return <tt>2*(bv1&bv2)_o / [bv1_o + bv2_o]</tt>
120 */
121 template <typename T1, typename T2>
122 RDKIT_DATASTRUCTS_EXPORT double DiceSimilarity(const T1& bv1, const T2& bv2);
123 
124 //! returns the Tversky similarity between two bit vects
125 /*!
126  \return <tt>(bv1&bv2)_o / [a*bv1_o + b*bv2_o + (1 - a - b)*(bv1&bv2)_o]</tt>
127 
128  Notes:
129  # 0 <= a,b <= 1
130  # Tversky(a=1,b=1) = Tanimoto
131  # Tversky(a=1/2,b=1/2) = Dice
132 
133 */
134 template <typename T1, typename T2>
135 RDKIT_DATASTRUCTS_EXPORT double TverskySimilarity(const T1& bv1, const T2& bv2,
136  double a, double b);
137 
138 //! returns the Sokal similarity between two bit vects
139 /*!
140  \return <tt>(bv1&bv2)_o / [2*bv1_o + 2*bv2_o - 3*(bv1&bv2)_o]</tt>
141 */
142 template <typename T1, typename T2>
143 RDKIT_DATASTRUCTS_EXPORT double SokalSimilarity(const T1& bv1, const T2& bv2);
144 
145 //! returns the McConnaughey similarity between two bit vects
146 /*!
147  \return <tt>[(bv1&bv2)_o * (bv1_o + bv2_o) - (bv1_o * bv2_o)] / (bv1_o *
148  bv2_o)</tt>
149 */
150 template <typename T1, typename T2>
152  const T2& bv2);
153 
154 //! returns the Asymmetric similarity between two bit vects
155 /*!
156  \return <tt>(bv1&bv2)_o / min(bv1_o,bv2_o)</tt>
157 */
158 template <typename T1, typename T2>
160  const T2& bv2);
161 
162 //! returns the Braun-Blanquet similarity between two bit vects
163 /*!
164  \return <tt>(bv1&bv2)_o / max(bv1_o,bv2_o)</tt>
165 */
166 template <typename T1, typename T2>
168  const T2& bv2);
169 
170 //! returns the Russel similarity between two bit vects
171 /*!
172  \return <tt>(bv1&bv2)_o / bv1_o</tt>
173 
174  <b>Note:</b> that this operation is non-commutative:
175  RusselSimilarity(bv1,bv2) != RusselSimilarity(bv2,bv1)
176 
177 */
178 template <typename T1, typename T2>
179 RDKIT_DATASTRUCTS_EXPORT double RusselSimilarity(const T1& bv1, const T2& bv2);
180 
181 //! returns the Rogot-Goldberg similarity between two bit vects
182 /*!
183  \return <tt>(bv1&bv2)_o / (bv1_o + bv2_o)
184  + (bv1_n - bv1_o - bv2_o + (bv1&bv2)_o) / (2*bv1_n - bv1_o - bv2_o) </tt>
185 */
186 template <typename T1, typename T2>
188  const T2& bv2);
189 
190 //! returns the on bit similarity between two bit vects
191 /*!
192  \return <tt>(bv1&bv2)_o / (bv1|bv2)_o </tt>
193 */
194 template <typename T1, typename T2>
195 RDKIT_DATASTRUCTS_EXPORT double OnBitSimilarity(const T1& bv1, const T2& bv2);
196 
197 //! returns the number of common bits (on and off) between two bit vects
198 /*!
199  \return <tt>bv1_n - (bv1^bv2)_o</tt>
200 */
201 template <typename T1, typename T2>
202 RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const T1& bv1, const T2& bv2);
203 
205  const ExplicitBitVect& bv2);
206 
207 //! returns the common-bit similarity (on and off) between two bit vects
208 //! This is also called Manhattan similarity.
209 /*!
210  \return <tt>[bv1_n - (bv1^bv2)_o] / bv1_n</tt>
211 */
212 template <typename T1, typename T2>
213 RDKIT_DATASTRUCTS_EXPORT double AllBitSimilarity(const T1& bv1, const T2& bv2);
214 
215 //! returns an IntVect with indices of all on bits in common between two bit
216 //! vects
217 template <typename T1, typename T2>
218 RDKIT_DATASTRUCTS_EXPORT IntVect OnBitsInCommon(const T1& bv1, const T2& bv2);
219 
220 //! returns an IntVect with indices of all off bits in common between two bit
221 //! vects
222 template <typename T1, typename T2>
223 RDKIT_DATASTRUCTS_EXPORT IntVect OffBitsInCommon(const T1& bv1, const T2& bv2);
224 
225 //! returns the on-bit projected similarities between two bit vects
226 /*!
227  \return two values, as a DoubleVect:
228  - <tt>(bv1&bv2)_o / bv1_o</tt>
229  - <tt>(bv1&bv2)_o / bv2_o</tt>
230 */
231 template <typename T1, typename T2>
233  const T2& bv2);
234 
235 //! returns the off-bit projected similarities between two bit vects
236 /*!
237  \return two values, as a DoubleVect:
238  - <tt>[bv1_n - (bv1|bv2)_o] / [bv1_n - bv1_o]</tt>
239  - <tt>[bv2_n - (bv1|bv2)_o] / [bv2_n - bv2_o]</tt>
240 
241  <b>Note:</b> <tt>bv1_n = bv2_n</tt>
242 
243 */
244 template <typename T1, typename T2>
246  const T2& bv2);
247 
248 //! folds a bit vector \c factor times and returns the result
249 /*!
250  \param bv1 the vector to be folded
251  \param factor (optional) the number of times to fold it
252 
253  \return a pointer to the folded fingerprint, which is
254  <tt>bv1_n/factor</tt> long.
255 
256  <b>Note:</b> The caller is responsible for <tt>delete</tt>ing the result.
257  */
258 template <typename T1>
259 RDKIT_DATASTRUCTS_EXPORT T1* FoldFingerprint(const T1& bv1,
260  unsigned int factor = 2);
261 
262 //! returns a text representation of a bit vector (a string of 0s and 1s)
263 /*!
264  \param bv1 the vector to use
265 
266  \return an std::string
267 
268  */
269 template <typename T1>
270 RDKIT_DATASTRUCTS_EXPORT std::string BitVectToText(const T1& bv1);
271 
272 //! returns a hex representation of a bit vector compatible with Andrew Dalke's
273 //! FPS format
274 /*!
275  \param bv1 the vector to use
276 
277  \return an std::string
278 
279  */
280 template <typename T1>
281 RDKIT_DATASTRUCTS_EXPORT std::string BitVectToFPSText(const T1& bv1);
282 
283 //! returns a binary string representation of a bit vector (an array of bytes)
284 /*!
285  \param bv1 the vector to use
286 
287  \return an std::string
288 
289  */
290 template <typename T1>
291 RDKIT_DATASTRUCTS_EXPORT std::string BitVectToBinaryText(const T1& bv1);
292 
293 //! updates a bit vector from Andrew Dalke's FPS format
294 /*!
295  \param bv1 the vector to use
296  \param fps the FPS hex string
297 
298 
299  */
300 template <typename T1>
302  const std::string& fps);
303 
304 //! updates a bit vector from a binary string representation of a bit vector (an
305 //! array of bytes)
306 /*!
307  \param bv1 the vector to use
308  \param fps the binary string
309 
310 
311  */
312 template <typename T1>
314  T1& bv1, const std::string& fps);
315 
316 // FIX: docs and tests please
317 
319  const unsigned char* bv1, unsigned int nBytes);
320 
321 RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTanimoto(const unsigned char* bv1,
322  const unsigned char* bv2,
323  unsigned int nBytes);
324 RDKIT_DATASTRUCTS_EXPORT double CalcBitmapDice(const unsigned char* bv1,
325  const unsigned char* bv2,
326  unsigned int nBytes);
327 RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTversky(const unsigned char* bv1,
328  const unsigned char* bv2,
329  unsigned int nBytes,
330  double ca, double cb);
332  const unsigned char* probe, const unsigned char* ref, unsigned int nBytes);
333 #endif
SimilarityWrapper
double SimilarityWrapper(const T &bv1, const T &bv2, double(*metric)(const T &, const T &), bool returnDistance=false)
Definition: BitOps.h:31
DiceSimilarity
RDKIT_DATASTRUCTS_EXPORT double DiceSimilarity(const T1 &bv1, const T2 &bv2)
returns the Dice similarity between two bit vects
OffBitProjSimilarity
RDKIT_DATASTRUCTS_EXPORT DoubleVect OffBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the on-bit projected similarities between two bit vects
BraunBlanquetSimilarity
RDKIT_DATASTRUCTS_EXPORT double BraunBlanquetSimilarity(const T1 &bv1, const T2 &bv2)
returns the Braun-Blanquet similarity between two bit vects
FoldFingerprint
RDKIT_DATASTRUCTS_EXPORT T1 * FoldFingerprint(const T1 &bv1, unsigned int factor=2)
folds a bit vector factor times and returns the result
NumOnBitsInCommon
RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of on bits in common between two bit vectors
BitVectToText
RDKIT_DATASTRUCTS_EXPORT std::string BitVectToText(const T1 &bv1)
returns a text representation of a bit vector (a string of 0s and 1s)
CalcBitmapDice
RDKIT_DATASTRUCTS_EXPORT double CalcBitmapDice(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes)
McConnaugheySimilarity
RDKIT_DATASTRUCTS_EXPORT double McConnaugheySimilarity(const T1 &bv1, const T2 &bv2)
returns the McConnaughey similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT
#define RDKIT_DATASTRUCTS_EXPORT
Definition: export.h:112
RusselSimilarity
RDKIT_DATASTRUCTS_EXPORT double RusselSimilarity(const T1 &bv1, const T2 &bv2)
returns the Russel similarity between two bit vects
AllProbeBitsMatch
RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char *probe, const char *ref)
AsymmetricSimilarity
RDKIT_DATASTRUCTS_EXPORT double AsymmetricSimilarity(const T1 &bv1, const T2 &bv2)
returns the Asymmetric similarity between two bit vects
CalcBitmapPopcount
RDKIT_DATASTRUCTS_EXPORT unsigned int CalcBitmapPopcount(const unsigned char *bv1, unsigned int nBytes)
OffBitsInCommon
RDKIT_DATASTRUCTS_EXPORT IntVect OffBitsInCommon(const T1 &bv1, const T2 &bv2)
returns an IntVect with indices of all off bits in common between two bit
TanimotoSimilarity
RDKIT_DATASTRUCTS_EXPORT double TanimotoSimilarity(const T1 &bv1, const T2 &bv2)
returns the Tanimoto similarity between two bit vects
CalcBitmapTanimoto
RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTanimoto(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes)
OnBitsInCommon
RDKIT_DATASTRUCTS_EXPORT IntVect OnBitsInCommon(const T1 &bv1, const T2 &bv2)
returns an IntVect with indices of all on bits in common between two bit
KulczynskiSimilarity
RDKIT_DATASTRUCTS_EXPORT double KulczynskiSimilarity(const T1 &bv1, const T2 &bv2)
returns the Kulczynski similarity between two bit vects
DoubleVect
std::vector< double > DoubleVect
Definition: BitVect.h:19
UpdateBitVectFromBinaryText
RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromBinaryText(T1 &bv1, const std::string &fps)
updates a bit vector from a binary string representation of a bit vector (an
UpdateBitVectFromFPSText
RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromFPSText(T1 &bv1, const std::string &fps)
updates a bit vector from Andrew Dalke's FPS format
OnBitProjSimilarity
RDKIT_DATASTRUCTS_EXPORT DoubleVect OnBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the on-bit projected similarities between two bit vects
IntVect
std::vector< int > IntVect
Definition: BitVect.h:17
OnBitSimilarity
RDKIT_DATASTRUCTS_EXPORT double OnBitSimilarity(const T1 &bv1, const T2 &bv2)
returns the on bit similarity between two bit vects
AllBitSimilarity
RDKIT_DATASTRUCTS_EXPORT double AllBitSimilarity(const T1 &bv1, const T2 &bv2)
CosineSimilarity
RDKIT_DATASTRUCTS_EXPORT double CosineSimilarity(const T1 &bv1, const T2 &bv2)
returns the Cosine similarity between two bit vects
NumBitsInCommon
RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of common bits (on and off) between two bit vects
BitVects.h
Pulls in all the BitVect classes.
TverskySimilarity
RDKIT_DATASTRUCTS_EXPORT double TverskySimilarity(const T1 &bv1, const T2 &bv2, double a, double b)
returns the Tversky similarity between two bit vects
SokalSimilarity
RDKIT_DATASTRUCTS_EXPORT double SokalSimilarity(const T1 &bv1, const T2 &bv2)
returns the Sokal similarity between two bit vects
BitVectToBinaryText
RDKIT_DATASTRUCTS_EXPORT std::string BitVectToBinaryText(const T1 &bv1)
returns a binary string representation of a bit vector (an array of bytes)
RogotGoldbergSimilarity
RDKIT_DATASTRUCTS_EXPORT double RogotGoldbergSimilarity(const T1 &bv1, const T2 &bv2)
returns the Rogot-Goldberg similarity between two bit vects
CalcBitmapAllProbeBitsMatch
RDKIT_DATASTRUCTS_EXPORT bool CalcBitmapAllProbeBitsMatch(const unsigned char *probe, const unsigned char *ref, unsigned int nBytes)
CalcBitmapTversky
RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTversky(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes, double ca, double cb)
ExplicitBitVect
a class for bit vectors that are densely occupied
Definition: ExplicitBitVect.h:29
export.h
BitVectToFPSText
RDKIT_DATASTRUCTS_EXPORT std::string BitVectToFPSText(const T1 &bv1)
returns a hex representation of a bit vector compatible with Andrew Dalke's