RDKit
Open-source cheminformatics and machine learning.
LeaderPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3 // Copyright (C) 2017-2019 Greg Landrum and NextMove Software
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef RD_LEADERPICKER_H
13 #define RD_LEADERPICKER_H
14 
15 #include <RDGeneral/types.h>
16 #include <RDGeneral/utils.h>
17 #include <RDGeneral/Invariant.h>
18 #include <RDGeneral/RDLog.h>
19 #include <RDGeneral/Exceptions.h>
20 #include <cstdlib>
21 #include "DistPicker.h"
22 
23 namespace RDPickers {
24 
25 /*! \brief Implements the Leader algorithm for picking a subset of item from a
26  *pool
27  *
28  * This class inherits from the DistPicker and implements a specific picking
29  *strategy aimed at diversity. See documentation for "pick()" member function
30  *for the algorithm details
31  */
33  public:
36 
37  /*! \brief Default Constructor
38  *
39  */
40  LeaderPicker() : default_threshold(0.0), default_nthreads(1) {}
41  LeaderPicker(double threshold)
42  : default_threshold(threshold), default_nthreads(1) {}
43  LeaderPicker(double threshold, int nthreads)
44  : default_threshold(threshold), default_nthreads(nthreads) {}
45 
46  /*! \brief Contains the implementation for a lazy Leader diversity picker
47  *
48  * See the documentation for the pick() method for details about the algorithm
49  *
50  * \param func - a function (or functor) taking two unsigned ints as
51  *arguments and returning the distance (as a double) between those two
52  *elements.
53  *
54  * \param poolSize - the size of the pool to pick the items from. It is
55  *assumed that the distance matrix above contains the right number of
56  *elements; i.e. poolSize*(poolSize-1)
57  *
58  * \param pickSize - the number items to pick from pool (<= poolSize)
59  *
60  * \param firstPicks - (optional)the first items in the pick list
61  *
62  * \param seed - (optional) seed for the random number generator. If this is
63  *<0 the generator will be seeded with a random number.
64  */
65  template <typename T>
66  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
67  unsigned int pickSize) const;
68 
69  template <typename T>
70  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
71  unsigned int pickSize, double threshold) const;
72 
73  template <typename T>
74  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
75  unsigned int pickSize,
76  const RDKit::INT_VECT &firstPicks,
77  double threshold) const;
78 
79  template <typename T>
80  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
81  unsigned int pickSize,
82  const RDKit::INT_VECT &firstPicks, double threshold,
83  int nthreads) const;
84 
85  /*! \brief Contains the implementation for the Leader diversity picker
86  *
87  * \param distMat - distance matrix - a vector of double. It is assumed that
88  *only the lower triangle element of the matrix are supplied in a 1D array\n
89  *
90  * \param poolSize - the size of the pool to pick the items from. It is
91  *assumed that the distance matrix above contains the right number of
92  *elements; i.e. poolSize*(poolSize-1) \n
93  *
94  * \param pickSize - maximum number items to pick from pool (<= poolSize)
95  *
96  * \param firstPicks - indices of the items used to seed the pick set.
97  */
98  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
99  unsigned int pickSize, const RDKit::INT_VECT &firstPicks,
100  double threshold, int nthreads) const {
101  CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
102  if (!poolSize) throw ValueErrorException("empty pool to pick from");
103  if (poolSize < pickSize)
104  throw ValueErrorException("pickSize cannot be larger than the poolSize");
105  distmatFunctor functor(distMat);
106  return this->lazyPick(functor, poolSize, pickSize, firstPicks, threshold,
107  nthreads);
108  }
109 
110  /*! \overload */
111  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
112  unsigned int pickSize) const {
113  RDKit::INT_VECT iv;
114  return pick(distMat, poolSize, pickSize, iv, default_threshold,
115  default_nthreads);
116  }
117 };
118 
119 #ifdef USE_THREADED_LEADERPICKER
120 // Note that this block of code currently only works on linux (which is why it's disabled by default)
121 // We will revisit this during the 2020.03 release cycle in order to get a multi-threaded version of
122 // the LeaderPicker that works on all supported platforms
123 template <typename T>
124 void *LeaderPickerWork(void *arg);
125 
126 template <typename T>
127 struct LeaderPickerState {
128  typedef struct {
129  int *ptr;
130  unsigned int capacity;
131  unsigned int len;
132  unsigned int next[2];
133  } LeaderPickerBlock;
134  typedef struct {
135  LeaderPickerState<T> *stat;
136  pthread_t tid;
137  unsigned int id;
138  } LeaderPickerThread;
139 
140  std::vector<LeaderPickerThread> threads;
141  std::vector<LeaderPickerBlock> blocks;
142  pthread_barrier_t wait;
143  pthread_barrier_t done;
144  std::vector<int> v;
145  LeaderPickerBlock *head_block;
146  unsigned int thread_op;
147  unsigned int nthreads;
148  unsigned int tick;
149  double threshold;
150  int query;
151  T *func;
152 
153  LeaderPickerState(unsigned int count, int nt) {
154  v.resize(count);
155  for (unsigned int i = 0; i < count; i++) v[i] = i;
156 
157  // InitializeBlocks
158  unsigned int bcount;
159  unsigned int bsize;
160  if (nt > 1) {
161  bsize = 4096;
162  bcount = (count + (bsize - 1)) / bsize;
163  unsigned int tasks = (bcount + 1) / 2;
164  // limit number of threads to available work
165  if (nt > (int)tasks) nt = tasks;
166  } else {
167  bsize = 32768;
168  bcount = (count + (bsize - 1)) / bsize;
169  }
170  blocks.resize(bcount);
171  head_block = &blocks[0];
172  tick = 0;
173 
174  if (bcount > 1) {
175  int *ptr = &v[0];
176  unsigned int len = count;
177  for (unsigned int i = 0; i < bcount; i++) {
178  LeaderPickerBlock *block = &blocks[i];
179  block->ptr = ptr;
180  if (len > bsize) {
181  block->capacity = bsize;
182  block->len = bsize;
183  block->next[0] = i + 1;
184  } else {
185  block->capacity = len;
186  block->len = len;
187  block->next[0] = 0;
188  break;
189  }
190  ptr += bsize;
191  len -= bsize;
192  }
193  } else {
194  head_block->capacity = count;
195  head_block->len = count;
196  head_block->next[0] = 0;
197  head_block->next[1] = 0;
198  head_block->ptr = &v[0];
199  }
200 
201  // InitializeThreads
202  if (nt > 1) {
203  nthreads = nt;
204  pthread_barrier_init(&wait, NULL, nthreads + 1);
205  pthread_barrier_init(&done, NULL, nthreads + 1);
206 
207  threads.resize(nt);
208  for (unsigned int i = 0; i < nthreads; i++) {
209  threads[i].id = i;
210  threads[i].stat = this;
211  pthread_create(&threads[i].tid, NULL, LeaderPickerWork<T>,
212  (void *)&threads[i]);
213  }
214  } else
215  nthreads = 1;
216  }
217 
218  ~LeaderPickerState() {
219  if (nthreads > 1) {
220  thread_op = 1;
221  pthread_barrier_wait(&wait);
222  for (unsigned int i = 0; i < nthreads; i++)
223  pthread_join(threads[i].tid, 0);
224  pthread_barrier_destroy(&wait);
225  pthread_barrier_destroy(&done);
226  }
227  }
228 
229  bool empty() {
230  while (head_block) {
231  if (head_block->len) return false;
232  unsigned int next_tick = head_block->next[tick];
233  if (!next_tick) return true;
234  head_block = &blocks[next_tick];
235  }
236  return true;
237  }
238 
239  unsigned int compact(int *dst, int *src, unsigned int len) {
240  unsigned int count = 0;
241  for (unsigned int i = 0; i < len; i++) {
242  if ((*func)(query, src[i]) > threshold) dst[count++] = src[i];
243  }
244  return count;
245  }
246 
247  void compact_job(unsigned int cycle) {
248  // On entry, next[tick] for each block is the current linked list.
249  // On exit, next[tock] is the linked list for the next iteration.
250  unsigned int tock = tick ^ 1;
251 
252  LeaderPickerBlock *list = head_block;
253  for (;;) {
254  unsigned int next_tick = list->next[tick];
255  if (next_tick) {
256  LeaderPickerBlock *next = &blocks[next_tick];
257  unsigned int next_next_tick = next->next[tick];
258  if (cycle == 0) {
259  list->len = compact(list->ptr, list->ptr, list->len);
260  if (list->len + next->len <= list->capacity) {
261  list->len += compact(list->ptr + list->len, next->ptr, next->len);
262  list->next[tock] = next_next_tick;
263  } else {
264  next->len = compact(next->ptr, next->ptr, next->len);
265  if (next->len) {
266  list->next[tock] = next_tick;
267  next->next[tock] = next_next_tick;
268  } else
269  list->next[tock] = next_next_tick;
270  }
271  cycle = nthreads - 1;
272  } else
273  cycle--;
274  if (next_next_tick) {
275  list = &blocks[next_next_tick];
276  } else
277  break;
278  } else {
279  if (cycle == 0) {
280  list->len = compact(list->ptr, list->ptr, list->len);
281  list->next[tock] = 0;
282  }
283  break;
284  }
285  }
286  }
287 
288  void compact(int pick) {
289  query = pick;
290  if (nthreads > 1) {
291  thread_op = 0;
292  pthread_barrier_wait(&wait);
293  pthread_barrier_wait(&done);
294  } else
295  compact_job(0);
296  tick ^= 1;
297  }
298 
299  int compact_next() {
300  compact(head_block->ptr[0]);
301  return query;
302  }
303 };
304 
305 // This is the loop the worker threads run
306 template <typename T>
307 void *LeaderPickerWork(void *arg) {
308  typename LeaderPickerState<T>::LeaderPickerThread *thread;
309  thread = (typename LeaderPickerState<T>::LeaderPickerThread *)arg;
310  LeaderPickerState<T> *stat = thread->stat;
311 
312  for (;;) {
313  pthread_barrier_wait(&stat->wait);
314  if (stat->thread_op) return (void *)0;
315  stat->compact_job(thread->id);
316  pthread_barrier_wait(&stat->done);
317  }
318 }
319 #else
320 
321 template <typename T>
323  std::vector<int> v;
324  unsigned int left;
325  double threshold;
326  int query;
327  T *func;
328 
329  LeaderPickerState(unsigned int count, int) {
330  v.resize(count);
331  for (unsigned int i = 0; i < count; i++) v[i] = i;
332  left = count;
333  }
334 
335  bool empty() { return left == 0; }
336 
337  unsigned int compact(int *dst, int *src, unsigned int len) {
338  unsigned int count = 0;
339  for (unsigned int i = 0; i < len; i++) {
340  double ld = (*func)(query, src[i]);
341  // std::cerr << query << "-" << src[i] << " " << ld << std::endl;
342  if (ld > threshold) dst[count++] = src[i];
343  }
344  return count;
345  }
346 
347  void compact(int pick) {
348  query = pick;
349  left = compact(&v[0], &v[0], left);
350  }
351 
352  int compact_next() {
353  query = v[0];
354  left = compact(&v[0], &v[1], left - 1);
355  return query;
356  }
357 };
358 
359 #endif
360 // we implement this here in order to allow arbitrary functors without link
361 // errors
362 template <typename T>
363 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
364  unsigned int pickSize,
365  const RDKit::INT_VECT &firstPicks,
366  double threshold, int nthreads) const {
367  if (!poolSize) throw ValueErrorException("empty pool to pick from");
368 
369  if (poolSize < pickSize)
370  throw ValueErrorException("pickSize cannot be larger than the poolSize");
371 
372  if (!pickSize) pickSize = poolSize;
373  RDKit::INT_VECT picks;
374 
375  LeaderPickerState<T> stat(poolSize, nthreads);
376  stat.threshold = threshold;
377  stat.func = &func;
378 
379  unsigned int picked = 0; // picks.size()
380  unsigned int pick = 0;
381 
382  if (!firstPicks.empty()) {
383  for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
384  pIdx != firstPicks.end(); ++pIdx) {
385  pick = static_cast<unsigned int>(*pIdx);
386  if (pick >= poolSize) {
387  throw ValueErrorException("pick index was larger than the poolSize");
388  }
389  picks.push_back(pick);
390  stat.compact(pick);
391  picked++;
392  }
393  }
394 
395  while (picked < pickSize && !stat.empty()) {
396  pick = stat.compact_next();
397  picks.push_back(pick);
398  picked++;
399  }
400  return picks;
401 }
402 
403 template <typename T>
404 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
405  unsigned int pickSize) const {
406  RDKit::INT_VECT firstPicks;
407  return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks,
409 }
410 
411 template <typename T>
412 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
413  unsigned int pickSize,
414  double threshold) const {
415  RDKit::INT_VECT firstPicks;
416  return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
418 }
419 template <typename T>
420 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
421  unsigned int pickSize,
422  const RDKit::INT_VECT &firstPicks,
423  double threshold) const {
424  return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
426 }
427 
428 }; // namespace RDPickers
429 
430 #endif
RDKit::INT_VECT
std::vector< int > INT_VECT
Definition: types.h:254
RDPickers::LeaderPickerState::compact
void compact(int pick)
Definition: LeaderPicker.h:347
types.h
RDPickers::LeaderPickerState::empty
bool empty()
Definition: LeaderPicker.h:335
RDPickers::LeaderPickerState
Definition: LeaderPicker.h:322
RDPickers::LeaderPicker::LeaderPicker
LeaderPicker(double threshold, int nthreads)
Definition: LeaderPicker.h:43
RDPickers::LeaderPicker::pick
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, const RDKit::INT_VECT &firstPicks, double threshold, int nthreads) const
Contains the implementation for the Leader diversity picker.
Definition: LeaderPicker.h:98
CHECK_INVARIANT
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:101
RDPickers::LeaderPicker::default_threshold
double default_threshold
Definition: LeaderPicker.h:34
RDPickers::LeaderPicker::LeaderPicker
LeaderPicker()
Default Constructor.
Definition: LeaderPicker.h:40
RDKIT_SIMDIVPICKERS_EXPORT
#define RDKIT_SIMDIVPICKERS_EXPORT
Definition: export.h:619
RDPickers::LeaderPickerState::threshold
double threshold
Definition: LeaderPicker.h:325
RDPickers::LeaderPickerState::LeaderPickerState
LeaderPickerState(unsigned int count, int)
Definition: LeaderPicker.h:329
utils.h
RDPickers::LeaderPickerState::left
unsigned int left
Definition: LeaderPicker.h:324
RDPickers::LeaderPicker::lazyPick
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy Leader diversity picker.
Definition: LeaderPicker.h:404
ValueErrorException
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:33
Invariant.h
RDPickers::LeaderPicker::default_nthreads
int default_nthreads
Definition: LeaderPicker.h:35
RDPickers::LeaderPickerState::v
std::vector< int > v
Definition: LeaderPicker.h:323
RDPickers::LeaderPicker
Implements the Leader algorithm for picking a subset of item from a pool.
Definition: LeaderPicker.h:32
RDPickers::LeaderPickerState::query
int query
Definition: LeaderPicker.h:326
RDPickers::LeaderPickerState::compact
unsigned int compact(int *dst, int *src, unsigned int len)
Definition: LeaderPicker.h:337
RDPickers::DistPicker
Abstract base class to do perform item picking (typically molecules) using a distance matrix.
Definition: DistPicker.h:46
RDPickers::LeaderPickerState::func
T * func
Definition: LeaderPicker.h:327
RDLog.h
RDPickers::LeaderPicker::LeaderPicker
LeaderPicker(double threshold)
Definition: LeaderPicker.h:41
RDPickers
Definition: DistPicker.h:16
RDPickers::LeaderPicker::pick
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
Definition: LeaderPicker.h:111
RDPickers::LeaderPickerState::compact_next
int compact_next()
Definition: LeaderPicker.h:352
Exceptions.h
DistPicker.h
export.h