RDKit
Open-source cheminformatics and machine learning.
HierarchicalClusterPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2006 Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef _HIERARCHCLUSTERPICKER_H
12 #define _HIERARCHCLUSTERPICKER_H
13 
14 #include <RDGeneral/types.h>
15 #include "DistPicker.h"
16 
17 namespace RDPickers {
18 
19 /*! \brief Diversity picker based on hierarchical clustering
20  *
21  * This class inherits from DistPicker since it uses the distance matrix
22  * for diversity picking. The clustering itself is done using the Murtagh
23  * code in $RDBASE/Code/ML/Cluster/Murtagh/
24  */
26  public:
27  /*! \brief The type of hierarchical clustering algorithm to use
28  * \note NOTE(review): the per-value notes give the conventional reading of each name - confirm against the clustering code */
29  typedef enum {
30  WARD = 1,  //!< presumably Ward's minimum-variance method
31  SLINK = 2,  //!< presumably single linkage
32  CLINK = 3,  //!< presumably complete linkage
33  UPGMA = 4,  //!< presumably unweighted average linkage
34  MCQUITTY = 5,  //!< presumably McQuitty's method
35  GOWER = 6,  //!< presumably Gower's (median) method
36  CENTROID = 7  //!< presumably the centroid method
37  } ClusterMethod;
38 
39  /*! \brief Constructor - takes a ClusterMethod as an argument
40  *
41  * \param clusterMethod - the hierarchical clustering method to use; it is stored in d_method and nothing else is done
42  */
43  explicit HierarchicalClusterPicker(ClusterMethod clusterMethod)
44  : d_method(clusterMethod) {
45  ;  // empty statement - the body has nothing further to do
46  };
47 
48  /*! \brief Picks pickSize items from the pool, one representative per cluster
49  *
50  * Here is how the algorithm works: \n
51  * FIX: Supply reference
52  *
53  * - The entire pool is clustered from the distance matrix using one of the
54  * hierarchical clustering methods (specified via the constructor). \n
55  * - Starting with the individual items in the pool, clusters are merged based
56  * on the output from the clustering method. \n
57  * - The merging is stopped when the number of clusters is the same as
58  * the number of picks.
59  * - For each item in a cluster the sum of the squares of the distances to the
60  * rest of the items (in the cluster) is computed. The item with the
61  * smallest such sum is picked as the representative of the cluster.
62  * Basically this tries to pick the item closest to the centroid of the
63  * cluster.
64  *
65  * \return an RDKit::INT_VECT identifying the picked items; presumably indices
66  * into the pool (TODO confirm against the implementation)
67  *
68  * \param distMat - distance matrix - an array of double. It is assumed
69  * that only the lower-triangle elements of the matrix are supplied, in a
70  * 1D array. \n
71  * NOTE: this matrix WILL BE ALTERED during the picking \n
72  * NOTE(review): the parameter is declared const double* - confirm how it is altered
73  * \param poolSize - the size of the pool to pick the items from. It is
74  * assumed that the distance matrix above contains the right number of
75  * elements; i.e. poolSize*(poolSize-1)/2 for a strict lower triangle
76  * (TODO confirm that the diagonal is excluded) \n
77  * \param pickSize - the number of items to pick from the pool
78  * (<= poolSize)
79  */
80  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
81  unsigned int pickSize) const;
82 
83  /*! \brief Clusters the items in the pool - used by the picker
84  *
85  * \return an RDKit::VECT_INT_VECT; presumably one INT_VECT of pool indices per cluster (TODO confirm)
86  *
87  * ARGUMENTS:
88  *
89  * \param distMat - distance matrix - an array of double. It is assumed that
90  * only the lower-triangle elements of the matrix are supplied, in a 1D
91  * array. \n
92  * NOTE: this matrix WILL BE ALTERED during the picking \n
93  * NOTE(review): the parameter is declared const double* - confirm how it is altered
94  * \param poolSize - the size of the pool to pick the items from. It is
95  * assumed that the distance matrix above contains the right number of
96  * elements; i.e. poolSize*(poolSize-1)/2 for a strict lower triangle
97  * (TODO confirm that the diagonal is excluded) \n
98  * \param pickSize - the number of clusters to divide the pool into (<=
99  * poolSize)
100  */
101  RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize,
102  unsigned int pickSize) const;
103 
104  private:
105  ClusterMethod d_method;  //!< hierarchical clustering method, initialized by the constructor
106 };
107 }; // namespace RDPickers
108 
109 #endif
RDKit::VECT_INT_VECT
std::vector< INT_VECT > VECT_INT_VECT
Definition: types.h:268
RDKit::INT_VECT
std::vector< int > INT_VECT
Definition: types.h:254
types.h
RDKIT_SIMDIVPICKERS_EXPORT
#define RDKIT_SIMDIVPICKERS_EXPORT
Definition: export.h:619
RDPickers::HierarchicalClusterPicker
Diversity picker based on hierarchical clustering.
Definition: HierarchicalClusterPicker.h:25
RDPickers::DistPicker
Abstract base class to perform item picking (typically molecules) using a distance matrix.
Definition: DistPicker.h:46
RDPickers
Definition: DistPicker.h:16
RDPickers::HierarchicalClusterPicker::ClusterMethod
ClusterMethod
The type of hierarchical clustering algorithm to use.
Definition: HierarchicalClusterPicker.h:29
RDPickers::HierarchicalClusterPicker::HierarchicalClusterPicker
HierarchicalClusterPicker(ClusterMethod clusterMethod)
Constructor - takes a ClusterMethod as an argument.
Definition: HierarchicalClusterPicker.h:43
DistPicker.h
export.h