RDKit
Open-source cheminformatics and machine learning.
LeaderPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3 // Copyright (C) 2017-2019 Greg Landrum and NextMove Software
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #ifndef RD_LEADERPICKER_H
12 #define RD_LEADERPICKER_H
13 
14 #include <RDGeneral/types.h>
15 #include <RDGeneral/utils.h>
16 #include <RDGeneral/Invariant.h>
17 #include <RDGeneral/RDLog.h>
18 #include <RDGeneral/Exceptions.h>
19 #include <RDGeneral/RDThreads.h>
20 #include <cstdlib>
21 #include "DistPicker.h"
22 
23 namespace RDPickers {
24 
25 /*! \brief Implements the Leader algorithm for picking a subset of item from a
26  *pool
27  *
28  * This class inherits from the DistPicker and implements a specific picking
29  *strategy aimed at diversity. See documentation for "pick()" member function
30  *for the algorithm details
31  */
32 class LeaderPicker : public DistPicker {
33  public:
34  double default_threshold{0.0};
36 
37  /*! \brief Default Constructor
38  *
39  */
41  LeaderPicker(double threshold)
42  : default_threshold(threshold), default_nthreads(1) {}
43  LeaderPicker(double threshold, int nthreads)
44  : default_threshold(threshold), default_nthreads(nthreads) {}
45 
46  /*! \brief Contains the implementation for a lazy Leader diversity picker
47  *
48  * See the documentation for the pick() method for details about the algorithm
49  *
50  * \param func - a function (or functor) taking two unsigned ints as
51  *arguments and returning the distance (as a double) between those two
52  *elements.
53  *
54  * \param poolSize - the size of the pool to pick the items from. It is
55  *assumed that the distance matrix above contains the right number of
56  *elements; i.e. poolSize*(poolSize-1)
57  *
58  * \param pickSize - the number items to pick from pool (<= poolSize)
59  *
60  * \param firstPicks - (optional)the first items in the pick list
61  *
62  * \param seed - (optional) seed for the random number generator. If this is
63  *<0 the generator will be seeded with a random number.
64  */
65  template <typename T>
66  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
67  unsigned int pickSize) const;
68 
69  template <typename T>
70  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
71  unsigned int pickSize, double threshold) const;
72 
73  template <typename T>
74  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
75  unsigned int pickSize,
76  const RDKit::INT_VECT &firstPicks,
77  double threshold) const;
78 
79  template <typename T>
80  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
81  unsigned int pickSize,
82  const RDKit::INT_VECT &firstPicks, double threshold,
83  int nthreads) const;
84 
85  /*! \brief Contains the implementation for the Leader diversity picker
86  *
87  * \param distMat - distance matrix - a vector of double. It is assumed that
88  *only the lower triangle element of the matrix are supplied in a 1D array\n
89  *
90  * \param poolSize - the size of the pool to pick the items from. It is
91  *assumed that the distance matrix above contains the right number of
92  *elements; i.e. poolSize*(poolSize-1) \n
93  *
94  * \param pickSize - maximum number items to pick from pool (<= poolSize)
95  *
96  * \param firstPicks - indices of the items used to seed the pick set.
97  */
98  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
99  unsigned int pickSize, const RDKit::INT_VECT &firstPicks,
100  double threshold, int nthreads) const {
101  CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
102  if (!poolSize) {
103  throw ValueErrorException("empty pool to pick from");
104  }
105  if (poolSize < pickSize) {
106  throw ValueErrorException("pickSize cannot be larger than the poolSize");
107  }
108  distmatFunctor functor(distMat);
109  return this->lazyPick(functor, poolSize, pickSize, firstPicks, threshold,
110  nthreads);
111  }
112 
113  /*! \overload */
114  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
115  unsigned int pickSize) const override {
116  RDKit::INT_VECT iv;
117  return pick(distMat, poolSize, pickSize, iv, default_threshold,
119  }
120 };
121 
122 #if defined(RDK_BUILD_THREADSAFE_SSS)
123 #if defined(unix) || defined(__unix__) || defined(__unix)
124 #define USE_THREADED_LEADERPICKER
125 #endif
126 #endif
127 
128 #ifdef USE_THREADED_LEADERPICKER
129 // Note that this block of code currently only works on linux (which is why it's
130 // disabled by default elsewhere). In order to work on other platforms we need
131 // cross-platform threading primitives which support a barrier; or a rewrite.
132 // Given that we will get the cross-platform threading for free with C++20, I
133 // think it makes sense to just wait
134 template <typename T>
135 void *LeaderPickerWork(void *arg);
136 
137 template <typename T>
138 struct LeaderPickerState {
139  typedef struct {
140  int *ptr;
141  unsigned int capacity;
142  unsigned int len;
143  unsigned int next[2];
144  } LeaderPickerBlock;
145  typedef struct {
146  LeaderPickerState<T> *stat;
147  pthread_t tid;
148  unsigned int id;
149  } LeaderPickerThread;
150 
151  std::vector<LeaderPickerThread> threads;
152  std::vector<LeaderPickerBlock> blocks;
153  pthread_barrier_t wait;
154  pthread_barrier_t done;
155  std::vector<int> v;
156  LeaderPickerBlock *head_block;
157  unsigned int thread_op;
158  unsigned int nthreads;
159  unsigned int tick;
160  double threshold;
161  int query;
162  T *func;
163 
164  LeaderPickerState(unsigned int count, int nt) {
165  v.resize(count);
166  for (unsigned int i = 0; i < count; i++) {
167  v[i] = i;
168  }
169 
170  // InitializeBlocks
171  unsigned int bcount;
172  unsigned int bsize;
173  if (nt > 1) {
174  bsize = 4096;
175  bcount = (count + (bsize - 1)) / bsize;
176  unsigned int tasks = (bcount + 1) / 2;
177  // limit number of threads to available work
178  if (nt > (int)tasks) {
179  nt = tasks;
180  }
181  } else {
182  bsize = 32768;
183  bcount = (count + (bsize - 1)) / bsize;
184  }
185  blocks.resize(bcount);
186  head_block = &blocks[0];
187  tick = 0;
188 
189  if (bcount > 1) {
190  int *ptr = &v[0];
191  unsigned int len = count;
192  for (unsigned int i = 0; i < bcount; i++) {
193  LeaderPickerBlock *block = &blocks[i];
194  block->ptr = ptr;
195  if (len > bsize) {
196  block->capacity = bsize;
197  block->len = bsize;
198  block->next[0] = i + 1;
199  } else {
200  block->capacity = len;
201  block->len = len;
202  block->next[0] = 0;
203  break;
204  }
205  ptr += bsize;
206  len -= bsize;
207  }
208  } else {
209  head_block->capacity = count;
210  head_block->len = count;
211  head_block->next[0] = 0;
212  head_block->next[1] = 0;
213  head_block->ptr = &v[0];
214  }
215 
216  // InitializeThreads
217  if (nt > 1) {
218  nthreads = nt;
219  pthread_barrier_init(&wait, NULL, nthreads + 1);
220  pthread_barrier_init(&done, NULL, nthreads + 1);
221 
222  threads.resize(nt);
223  for (unsigned int i = 0; i < nthreads; i++) {
224  threads[i].id = i;
225  threads[i].stat = this;
226  pthread_create(&threads[i].tid, NULL, LeaderPickerWork<T>,
227  (void *)&threads[i]);
228  }
229  } else {
230  nthreads = 1;
231  }
232  }
233 
234  ~LeaderPickerState() {
235  if (nthreads > 1) {
236  thread_op = 1;
237  pthread_barrier_wait(&wait);
238  for (unsigned int i = 0; i < nthreads; i++) {
239  pthread_join(threads[i].tid, 0);
240  }
241  pthread_barrier_destroy(&wait);
242  pthread_barrier_destroy(&done);
243  }
244  }
245 
246  bool empty() {
247  while (head_block) {
248  if (head_block->len) {
249  return false;
250  }
251  unsigned int next_tick = head_block->next[tick];
252  if (!next_tick) {
253  return true;
254  }
255  head_block = &blocks[next_tick];
256  }
257  return true;
258  }
259 
260  unsigned int compact(int *dst, int *src, unsigned int len) {
261  unsigned int count = 0;
262  for (unsigned int i = 0; i < len; i++) {
263  if ((*func)(query, src[i]) > threshold) {
264  dst[count++] = src[i];
265  }
266  }
267  return count;
268  }
269 
270  void compact_job(unsigned int cycle) {
271  // On entry, next[tick] for each block is the current linked list.
272  // On exit, next[tock] is the linked list for the next iteration.
273  unsigned int tock = tick ^ 1;
274 
275  LeaderPickerBlock *list = head_block;
276  for (;;) {
277  unsigned int next_tick = list->next[tick];
278  if (next_tick) {
279  LeaderPickerBlock *next = &blocks[next_tick];
280  unsigned int next_next_tick = next->next[tick];
281  if (cycle == 0) {
282  list->len = compact(list->ptr, list->ptr, list->len);
283  if (list->len + next->len <= list->capacity) {
284  list->len += compact(list->ptr + list->len, next->ptr, next->len);
285  list->next[tock] = next_next_tick;
286  } else {
287  next->len = compact(next->ptr, next->ptr, next->len);
288  if (next->len) {
289  list->next[tock] = next_tick;
290  next->next[tock] = next_next_tick;
291  } else {
292  list->next[tock] = next_next_tick;
293  }
294  }
295  cycle = nthreads - 1;
296  } else {
297  cycle--;
298  }
299  if (next_next_tick) {
300  list = &blocks[next_next_tick];
301  } else {
302  break;
303  }
304  } else {
305  if (cycle == 0) {
306  list->len = compact(list->ptr, list->ptr, list->len);
307  list->next[tock] = 0;
308  }
309  break;
310  }
311  }
312  }
313 
314  void compact(int pick) {
315  query = pick;
316  if (nthreads > 1) {
317  thread_op = 0;
318  pthread_barrier_wait(&wait);
319  pthread_barrier_wait(&done);
320  } else {
321  compact_job(0);
322  }
323  tick ^= 1;
324  }
325 
326  int compact_next() {
327  compact(head_block->ptr[0]);
328  return query;
329  }
330 };
331 
332 // This is the loop the worker threads run
333 template <typename T>
334 void *LeaderPickerWork(void *arg) {
335  typename LeaderPickerState<T>::LeaderPickerThread *thread;
336  thread = (typename LeaderPickerState<T>::LeaderPickerThread *)arg;
337  LeaderPickerState<T> *stat = thread->stat;
338 
339  for (;;) {
340  pthread_barrier_wait(&stat->wait);
341  if (stat->thread_op) {
342  return (void *)0;
343  }
344  stat->compact_job(thread->id);
345  pthread_barrier_wait(&stat->done);
346  }
347 }
348 #else
349 
350 template <typename T>
352  std::vector<int> v;
353  unsigned int left;
354  double threshold;
355  int query;
356  T *func;
357 
358  LeaderPickerState(unsigned int count, int)
359  : left(count), threshold(0.0), query(0), func(nullptr) {
360  v.resize(count);
361  for (unsigned int i = 0; i < count; i++) {
362  v[i] = i;
363  }
364  }
365 
366  bool empty() { return left == 0; }
367 
368  unsigned int compact(int *dst, int *src, unsigned int len) {
369  unsigned int count = 0;
370  for (unsigned int i = 0; i < len; i++) {
371  double ld = (*func)(query, src[i]);
372  // std::cerr << query << "-" << src[i] << " " << ld << std::endl;
373  if (ld > threshold) {
374  dst[count++] = src[i];
375  }
376  }
377  return count;
378  }
379 
380  void compact(int pick) {
381  query = pick;
382  left = compact(&v[0], &v[0], left);
383  }
384 
385  int compact_next() {
386  query = v[0];
387  left = compact(&v[0], &v[1], left - 1);
388  return query;
389  }
390 };
391 
392 #endif
393 // we implement this here in order to allow arbitrary functors without link
394 // errors
395 template <typename T>
396 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
397  unsigned int pickSize,
398  const RDKit::INT_VECT &firstPicks,
399  double threshold, int nthreads) const {
400  if (!poolSize) {
401  throw ValueErrorException("empty pool to pick from");
402  }
403 
404  if (poolSize < pickSize) {
405  throw ValueErrorException("pickSize cannot be larger than the poolSize");
406  }
407 
408  if (!pickSize) {
409  pickSize = poolSize;
410  }
411  RDKit::INT_VECT picks;
412 
413  nthreads = RDKit::getNumThreadsToUse(nthreads);
414 
415  LeaderPickerState<T> stat(poolSize, nthreads);
416  stat.threshold = threshold;
417  stat.func = &func;
418 
419  unsigned int picked = 0; // picks.size()
420  unsigned int pick = 0;
421 
422  if (!firstPicks.empty()) {
423  for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
424  pIdx != firstPicks.end(); ++pIdx) {
425  pick = static_cast<unsigned int>(*pIdx);
426  if (pick >= poolSize) {
427  throw ValueErrorException("pick index was larger than the poolSize");
428  }
429  picks.push_back(pick);
430  stat.compact(pick);
431  picked++;
432  }
433  }
434 
435  while (picked < pickSize && !stat.empty()) {
436  pick = stat.compact_next();
437  picks.push_back(pick);
438  picked++;
439  }
440  return picks;
441 }
442 
443 template <typename T>
444 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
445  unsigned int pickSize) const {
446  RDKit::INT_VECT firstPicks;
447  return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks,
449 }
450 
451 template <typename T>
452 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
453  unsigned int pickSize,
454  double threshold) const {
455  RDKit::INT_VECT firstPicks;
456  return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
458 }
459 template <typename T>
460 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
461  unsigned int pickSize,
462  const RDKit::INT_VECT &firstPicks,
463  double threshold) const {
464  return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
466 }
467 
468 }; // namespace RDPickers
469 
470 #endif
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:101
Abstract base class to do perform item picking (typically molecules) using a distance matrix.
Definition: DistPicker.h:46
Implements the Leader algorithm for picking a subset of item from a pool.
Definition: LeaderPicker.h:32
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy Leader diversity picker.
Definition: LeaderPicker.h:444
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, const RDKit::INT_VECT &firstPicks, double threshold, int nthreads) const
Contains the implementation for the Leader diversity picker.
Definition: LeaderPicker.h:98
LeaderPicker()
Default Constructor.
Definition: LeaderPicker.h:40
LeaderPicker(double threshold)
Definition: LeaderPicker.h:41
LeaderPicker(double threshold, int nthreads)
Definition: LeaderPicker.h:43
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const override
Definition: LeaderPicker.h:114
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:40
RDKIT_RGROUPDECOMPOSITION_EXPORT const std::string done
std::vector< int > INT_VECT
Definition: types.h:278
unsigned int getNumThreadsToUse(int target)
Definition: RDThreads.h:37
unsigned int compact(int *dst, int *src, unsigned int len)
Definition: LeaderPicker.h:368
LeaderPickerState(unsigned int count, int)
Definition: LeaderPicker.h:358