RDKit
Open-source cheminformatics and machine learning.
MaxMinPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3 // Copyright (C) 2017 Greg Landrum and NextMove Software
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef RD_MAXMINPICKER_H
13 #define RD_MAXMINPICKER_H
14 
15 #include <RDGeneral/types.h>
16 #include <RDGeneral/utils.h>
17 #include <RDGeneral/Invariant.h>
18 #include <RDGeneral/RDLog.h>
19 #include <RDGeneral/Exceptions.h>
20 #include <cstdlib>
21 #include "DistPicker.h"
22 #include <boost/random.hpp>
23 #include <random>
24 
25 namespace RDPickers {
26 
27 /*! \brief Implements the MaxMin algorithm for picking a subset of item from a
28  *pool
29  *
30  * This class inherits from the DistPicker and implements a specific picking
31  *strategy
32  * aimed at diversity. See documentation for "pick()" member function for the
33  *algorithm details
34  */
36  public:
37  /*! \brief Default Constructor
38  *
39  */
41 
42  /*! \brief Contains the implementation for a lazy MaxMin diversity picker
43  *
44  * See the documentation for the pick() method for details about the algorithm
45  *
46  * \param func - a function (or functor) taking two unsigned ints as
47  *arguments
48  * and returning the distance (as a double) between those two
49  *elements.
50  * \param poolSize - the size of the pool to pick the items from. It is
51  *assumed that the
52  * distance matrix above contains the right number of elements;
53  *i.e.
54  * poolSize*(poolSize-1)
55  * \param pickSize - the number items to pick from pool (<= poolSize)
56  * \param firstPicks - (optional)the first items in the pick list
57  * \param seed - (optional) seed for the random number generator.
58  * If this is <0 the generator will be seeded with a
59  * random number.
60  */
61  template <typename T>
62  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
63  unsigned int pickSize) const;
64 
65  template <typename T>
66  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
67  unsigned int pickSize,
68  const RDKit::INT_VECT &firstPicks,
69  int seed = -1) const;
70 
71  template <typename T>
72  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
73  unsigned int pickSize,
74  const RDKit::INT_VECT &firstPicks, int seed,
75  double &threshold) const;
76 
77  /*! \brief Contains the implementation for the MaxMin diversity picker
78  *
79  * Here is how the picking algorithm works, refer to
80  * Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
81  * for more detail:
82  *
83  * A subset of k items is to be selected from a pool containing N molecules.
84  * Then the MaxMin method is as follows:
85  * -# Initialise Subset with some appropriately chosen seed
86  * compound and set x = 1.
87  * -# For each of the N-x remaining compounds in Dataset
88  * calculate its dissimilarity with each of the x compounds in
89  * Subset and retain the smallest of these x dissimilarities for
90  * each compound in Dataset.
91  * -# Select the molecule from Dataset with the largest value
92  * for the smallest dissimilarity calculated in Step 2 and
93  * transfer it to Subset.
94  * -# Set x = x + 1 and return to Step 2 if x < k.
95  *
96  *
97  *
98  * \param distMat - distance matrix - a vector of double. It is assumed that
99  *only the
100  * lower triangle element of the matrix are supplied in a 1D
101  *array\n
102  * \param poolSize - the size of the pool to pick the items from. It is
103  *assumed that the
104  * distance matrix above contains the right number of elements;
105  *i.e.
106  * poolSize*(poolSize-1) \n
107  * \param pickSize - the number items to pick from pool (<= poolSize)
108  * \param firstPicks - indices of the items used to seed the pick set.
109  * \param seed - (optional) seed for the random number generator
110  * If this is <0 the generator will be seeded with a
111  * random number.
112  */
113  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
114  unsigned int pickSize, RDKit::INT_VECT firstPicks,
115  int seed = -1) const {
116  CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
117  if (!poolSize) throw ValueErrorException("empty pool to pick from");
118  if (poolSize < pickSize)
119  throw ValueErrorException("pickSize cannot be larger than the poolSize");
120  distmatFunctor functor(distMat);
121  return this->lazyPick(functor, poolSize, pickSize, firstPicks, seed);
122  }
123 
124  /*! \overload */
125  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
126  unsigned int pickSize) const {
127  RDKit::INT_VECT iv;
128  return pick(distMat, poolSize, pickSize, iv);
129  }
130 };
131 
133  double dist_bound; // distance to closest reference
134  unsigned int picks; // number of references considered
135  unsigned int next; // singly linked list of candidates
136 };
137 
138 // we implement this here in order to allow arbitrary functors without link
139 // errors
140 template <typename T>
141 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
142  unsigned int pickSize,
143  const RDKit::INT_VECT &firstPicks,
144  int seed, double &threshold) const {
145  if (!poolSize) throw ValueErrorException("empty pool to pick from");
146 
147  if (poolSize < pickSize)
148  throw ValueErrorException("pickSize cannot be larger than the poolSize");
149 
150  RDKit::INT_VECT picks;
151 
152  unsigned int memsize = (unsigned int)(poolSize * sizeof(MaxMinPickInfo));
153  MaxMinPickInfo *pinfo = new MaxMinPickInfo[memsize];
154  if (!pinfo) {
155  threshold = -1.0;
156  return picks;
157  }
158  memset(pinfo, 0, memsize);
159 
160  picks.reserve(pickSize);
161  unsigned int picked = 0; // picks.size()
162  unsigned int pick = 0;
163 
164  // pick the first entry
165  if (firstPicks.empty()) {
166  // get a seeded random number generator:
167  typedef boost::mt19937 rng_type;
168  typedef boost::uniform_int<> distrib_type;
169  typedef boost::variate_generator<rng_type &, distrib_type> source_type;
170  rng_type generator;
171  distrib_type dist(0, poolSize - 1);
172  if (seed >= 0) {
173  generator.seed(static_cast<rng_type::result_type>(seed));
174  } else {
175  generator.seed(std::random_device()());
176  }
177  source_type randomSource(generator, dist);
178  pick = randomSource();
179  // add the pick to the picks
180  picks.push_back(pick);
181  // and remove it from the pool
182  pinfo[pick].picks = 1;
183  picked = 1;
184 
185  } else {
186  for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
187  pIdx != firstPicks.end(); ++pIdx) {
188  pick = static_cast<unsigned int>(*pIdx);
189  if (pick >= poolSize) {
190  delete[] pinfo;
191  throw ValueErrorException("pick index was larger than the poolSize");
192  }
193  picks.push_back(pick);
194  pinfo[pick].picks = 1;
195  picked++;
196  }
197  }
198 
199  if (picked >= pickSize) {
200  threshold = -1.0;
201  delete[] pinfo;
202  return picks;
203  }
204 
205  unsigned int pool_list = 0;
206  unsigned int *prev = &pool_list;
207  // enter the pool into a list so that we can pick out of it easily
208  for (unsigned int i = 0; i < poolSize; i++)
209  if (pinfo[i].picks == 0) {
210  *prev = i;
211  prev = &pinfo[i].next;
212  }
213  *prev = 0;
214 
215  unsigned int poolIdx;
216  unsigned int pickIdx;
217 
218  // First pass initialize dist_bound
219  prev = &pool_list;
220  pickIdx = picks[0];
221  do {
222  poolIdx = *prev;
223  pinfo[poolIdx].dist_bound = func(poolIdx, pickIdx);
224  pinfo[poolIdx].picks = 1;
225  prev = &pinfo[poolIdx].next;
226  } while (*prev != 0);
227 
228  // now pick 1 compound at a time
229  double maxOFmin = -1.0;
230  double tmpThreshold = -1.0;
231  while (picked < pickSize) {
232  unsigned int *pick_prev = 0;
233  maxOFmin = -1.0;
234  prev = &pool_list;
235  do {
236  poolIdx = *prev;
237  double minTOi = pinfo[poolIdx].dist_bound;
238  if (minTOi > maxOFmin) {
239  unsigned int pi = pinfo[poolIdx].picks;
240  while (pi < picked) {
241  unsigned int picki = picks[pi];
242  CHECK_INVARIANT(poolIdx != picki, "pool index != pick index");
243  double dist = func(poolIdx, picki);
244  pi++;
245  if (dist <= minTOi) {
246  minTOi = dist;
247  if (minTOi <= maxOFmin) break;
248  }
249  }
250  pinfo[poolIdx].dist_bound = minTOi;
251  pinfo[poolIdx].picks = pi;
252  if (minTOi > maxOFmin) {
253  maxOFmin = minTOi;
254  pick_prev = prev;
255  pick = poolIdx;
256  }
257  }
258  prev = &pinfo[poolIdx].next;
259  } while (*prev != 0);
260 
261  // if the current distance is closer then threshold, we're done
262  if (maxOFmin <= threshold && threshold >= 0.0) break;
263  tmpThreshold = maxOFmin;
264  // now add the new pick to picks and remove it from the pool
265  *pick_prev = pinfo[pick].next;
266  picks.push_back(pick);
267  picked++;
268  }
269 
270  threshold = tmpThreshold;
271  delete[] pinfo;
272  return picks;
273 }
274 
275 template <typename T>
276 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
277  unsigned int pickSize,
278  const RDKit::INT_VECT &firstPicks,
279  int seed) const {
280  double threshold = -1.0;
281  return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
282  threshold);
283 }
284 
285 template <typename T>
286 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
287  unsigned int pickSize) const {
288  RDKit::INT_VECT firstPicks;
289  double threshold = -1.0;
290  int seed = -1;
291  return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
292  threshold);
293 }
294 }; // namespace RDPickers
295 
296 #endif
RDKit::INT_VECT
std::vector< int > INT_VECT
Definition: types.h:254
types.h
RDPickers::MaxMinPicker
Implements the MaxMin algorithm for picking a subset of item from a pool.
Definition: MaxMinPicker.h:35
RDPickers::MaxMinPicker::pick
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks, int seed=-1) const
Contains the implementation for the MaxMin diversity picker.
Definition: MaxMinPicker.h:113
RDPickers::MaxMinPickInfo::next
unsigned int next
Definition: MaxMinPicker.h:135
RDPickers::MaxMinPicker::lazyPick
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy MaxMin diversity picker.
Definition: MaxMinPicker.h:286
RDKit::rng_type
boost::minstd_rand rng_type
Definition: utils.h:38
CHECK_INVARIANT
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:101
RDKIT_SIMDIVPICKERS_EXPORT
#define RDKIT_SIMDIVPICKERS_EXPORT
Definition: export.h:619
utils.h
ValueErrorException
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:33
Invariant.h
RDPickers::DistPicker
Abstract base class to do perform item picking (typically molecules) using a distance matrix.
Definition: DistPicker.h:46
RDLog.h
RDPickers
Definition: DistPicker.h:16
RDPickers::MaxMinPicker::pick
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
Definition: MaxMinPicker.h:125
RDPickers::MaxMinPickInfo::picks
unsigned int picks
Definition: MaxMinPicker.h:134
RDPickers::MaxMinPickInfo
Definition: MaxMinPicker.h:132
RDPickers::MaxMinPicker::MaxMinPicker
MaxMinPicker()
Default Constructor.
Definition: MaxMinPicker.h:40
Exceptions.h
RDPickers::MaxMinPickInfo::dist_bound
double dist_bound
Definition: MaxMinPicker.h:133
DistPicker.h
export.h