RDKit
Open-source cheminformatics and machine learning.
FingerprintGenerator.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2018 Boran Adas, Google Summer of Code
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #include <RDGeneral/export.h>
12 #ifndef RD_FINGERPRINTGEN_H_2018_05
13 #define RD_FINGERPRINTGEN_H_2018_05
14 
18 #include <utility>
19 #include <vector>
20 #include <memory>
21 #include <cstdint>
22 
23 namespace RDKit {
24 class ROMol;
25 
27  using atomToBitsType = std::vector<std::vector<std::uint64_t>>;
29  std::map<std::uint64_t,
30  std::vector<std::pair<std::uint32_t, std::uint32_t>>>;
31  using bitPathsType = std::map<std::uint64_t, std::vector<std::vector<int>>>;
32  using atomCountsType = std::vector<unsigned int>;
33 
34  // numAtoms long
35  atomToBitsType *atomToBits = nullptr;
36 
37  // bitId -> vector of (atomId, radius) for morgan
38  // bitId -> (atom1, atom2) for atom pairs
39  bitInfoMapType *bitInfoMap = nullptr;
40 
41  // rdkit fp
42  // maps bitId -> vector of bond paths
43  bitPathsType *bitPaths = nullptr;
44 
45  // number of paths that set bits for each atom, must have the same size as
46  // atom count for molecule
47  atomCountsType *atomCounts = nullptr;
48 
50  atomToBitsHolder.reset(new atomToBitsType);
51  atomToBits = atomToBitsHolder.get();
52  }
54  bitInfoMapHolder.reset(new bitInfoMapType);
55  bitInfoMap = bitInfoMapHolder.get();
56  }
58  bitPathsHolder.reset(new bitPathsType);
59  bitPaths = bitPathsHolder.get();
60  }
62  atomCountsHolder.reset(new atomCountsType);
63  atomCounts = atomCountsHolder.get();
64  }
65 
66  private:
67  std::unique_ptr<atomToBitsType> atomToBitsHolder;
68  std::unique_ptr<bitInfoMapType> bitInfoMapHolder;
69  std::unique_ptr<bitPathsType> bitPathsHolder;
70  std::unique_ptr<atomCountsType> atomCountsHolder;
71 };
72 
73 /*!
74  \brief Abstract base class that holds the molecule-independent arguments that
75  are common amongst all fingerprint types; classes inheriting from it
76  hold the fingerprint-type-specific arguments
77 
78  */
79 template <typename OutputType>
81  : private boost::noncopyable {
82  public:
83  FingerprintArguments(bool countSimulation,
84  const std::vector<std::uint32_t> countBounds,
85  std::uint32_t fpSize,
86  std::uint32_t numBitsPerFeature = 1);
87  const bool d_countSimulation;
88  const std::vector<std::uint32_t> d_countBounds;
89  const std::uint32_t d_fpSize;
90  const std::uint32_t d_numBitsPerFeature;
91 
92  /*!
93  \brief Returns the size of the fingerprint based on arguments
94 
95  \return OutputType size of the fingerprint
96  */
97  virtual OutputType getResultSize() const = 0;
98 
99  /**
100  \brief method that returns information string about the fingerprint specific
101  argument set and the arguments themselves
102 
103  \return std::string information string
104  */
105  virtual std::string infoString() const = 0;
106 
107  /**
108  \brief method that returns information string about common fingerprinting
109  arguments' values
110 
111  \return std::string information string
112  */
113  std::string commonArgumentsString() const;
114 
116 };
117 
118 /*!
119  \brief abstract base class that holds atom-environments that will be hashed to
120  generate the fingerprint
121 
122  */
123 template <typename OutputType>
124 class RDKIT_FINGERPRINTS_EXPORT AtomEnvironment : private boost::noncopyable {
125  public:
126  /*!
127  \brief calculates and returns the bit id to be set for this atom-environment
128 
129  \param arguments Fingerprinting type specific molecule independent
130  arguments
131  \param atomInvariants Atom-invariants to be used during hashing
132  \param bondInvariants Bond-invariants to be used during hashing
133  \param hashResults if set, results will be ready to be modded (defaults to false)
134  \param fpSize defaults to 0; presumably the fingerprint size the bit id is
     prepared for - TODO confirm against the implementations.
     NOTE(review): listing line 140 is absent from this rendering; the member
     summary further down this page shows an additional "const AdditionalOutput"
     pointer parameter between bondInvariants and hashResults.
135  \return OutputType calculated bit id for this environment
136  */
137  virtual OutputType getBitId(FingerprintArguments<OutputType> *arguments,
138  const std::vector<std::uint32_t> *atomInvariants,
139  const std::vector<std::uint32_t> *bondInvariants,
141  const bool hashResults = false,
142  const std::uint64_t fpSize = 0) const = 0;
143 
  //! virtual, empty destructor: allows deletion through a base-class pointer
144  virtual ~AtomEnvironment() {}
145 };
146 
147 /*!
148  \brief abstract base class that generates atom-environments from a molecule
149 
150  */
151 template <typename OutputType>
153  : private boost::noncopyable {
154  public:
155  /*!
156  \brief generate and return all atom-environments from a molecule
157 
158  \param mol molecule to generate the atom-environments from
159  \param arguments fingerprint type specific molecule independent
160  arguments
161  \param fromAtoms atoms to be used during environment generation,
162  usage of this parameter depends on the implementation of different
163  fingerprint types
164  \param ignoreAtoms atoms to be ignored during environment generation,
165  usage of this parameter depends on the implementation of different
166  fingerprint types
167  \param confId which conformation to use during environment
168  generation, needed for some fingerprint types
169  \param additionalOutput contains pointers for additional outputs of
170  fingerprinting operation, usage depends on implementation of the fingerprint
171  type
172  \param atomInvariants atom invariants to be used during environment
173  generation, in some cases some of the hashing can be done during environment
174  generation so it is also passed here
175  \param bondInvariants bond invariants to be used during environment
176  generation, same as atomInvariants it might be needed
177  \param hashResults if set results will be ready to be modded
178 
179  \return std::vector<AtomEnvironment *> atom-environments generated from
180  this molecule
181  */
182  virtual std::vector<AtomEnvironment<OutputType> *> getEnvironments(
183  const ROMol &mol, FingerprintArguments<OutputType> *arguments,
184  const std::vector<std::uint32_t> *fromAtoms = nullptr,
185  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
186  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
187  const std::vector<std::uint32_t> *atomInvariants = nullptr,
188  const std::vector<std::uint32_t> *bondInvariants = nullptr,
189  const bool hashResults = false) const = 0;
190 
191  /**
192  \brief method that returns information about this \c AtomEnvironmentGenerator
193  and its arguments if any
194 
195  \return std::string information string
196  */
197  virtual std::string infoString() const = 0;
198 
200 };
201 
202 /*!
203  \brief abstract base class for atom invariants generators
204 
205  */
207  : private boost::noncopyable {
208  public:
209  /*!
210  \brief get atom invariants from a molecule
211 
212  \param mol molecule to generate the atom invariants for
213 
214  \return std::vector<std::uint32_t> atom invariants generated for the given
215  molecule
216  */
217  virtual std::vector<std::uint32_t> *getAtomInvariants(
218  const ROMol &mol) const = 0;
219 
220  /**
221  \brief method that returns information about this \c AtomInvariantsGenerator
222  and its arguments
223 
224  \return std::string information string
225  */
226  virtual std::string infoString() const = 0;
227 
229  virtual AtomInvariantsGenerator *clone() const = 0;
230 };
231 
232 /*!
233  \brief abstract base class for bond invariants generators
234 
235  */
237  : private boost::noncopyable {
238  public:
239  /*!
240  \brief get bond invariants from a molecule
241 
242  \param mol molecule to generate the bond invariants for
243 
244  \return std::vector<std::uint32_t> bond invariants generated for the given
245  molecule
246  */
247  virtual std::vector<std::uint32_t> *getBondInvariants(
248  const ROMol &mol) const = 0;
249 
250  /**
251  \brief method that returns information about this \c BondInvariantsGenerator
252  and its arguments
253 
254  \return std::string information string
255  */
256  virtual std::string infoString() const = 0;
257 
259  virtual BondInvariantsGenerator *clone() const = 0;
260 }; // class BondInvariantsGenerator
261 
262 /*!
263  \brief class that generates same fingerprint style for different output
264  formats
265 
266  */
267 template <typename OutputType>
269  : private boost::noncopyable {
270  FingerprintArguments<OutputType> *dp_fingerprintArguments;
271  AtomEnvironmentGenerator<OutputType> *dp_atomEnvironmentGenerator;
272  AtomInvariantsGenerator *dp_atomInvariantsGenerator;
273  BondInvariantsGenerator *dp_bondInvariantsGenerator;
274  const bool df_ownsAtomInvGenerator;
275  const bool df_ownsBondInvGenerator;
276 
277  SparseIntVect<OutputType> *getFingerprintHelper(
278  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
279  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
280  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
281  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
282  const std::vector<std::uint32_t> *customBondInvariants = nullptr,
283  const std::uint64_t fpSize = 0) const;
284 
285  public:
287  AtomEnvironmentGenerator<OutputType> *atomEnvironmentGenerator,
288  FingerprintArguments<OutputType> *fingerprintArguments,
289  AtomInvariantsGenerator *atomInvariantsGenerator = nullptr,
290  BondInvariantsGenerator *bondInvariantsGenerator = nullptr,
291  bool ownsAtomInvGenerator = false, bool ownsBondInvGenerator = false);
292 
294 
296  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
297  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
298  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
299  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
300  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
301 
303  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
304  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
305  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
306  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
307  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
308 
310  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
311  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
312  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
313  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
314  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
315 
317  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
318  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
319  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
320  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
321  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
322 
323  std::string infoString() const;
324 };
325 
327 
328 //! used to indicate errors for unimplemented fp types in convenience functions
330  : public std::exception {
331  public:
332  //! construct with an error message
333  UnimplementedFPException(const char *msg) : _msg(msg) {}
334  //! construct with an error message
335  UnimplementedFPException(std::string msg) : _msg(std::move(msg)) {}
336  //! get the error message
337  const char *what() const noexcept override { return _msg.c_str(); }
338  ~UnimplementedFPException() noexcept override = default;
339 
340  private:
341  std::string _msg;
342 };
343 
344 // convenience functions, fingerprint generation with default values
345 
347  const ROMol &mol, FPType fPType);
348 
350  FPType fPType);
351 
353  const ROMol &mol, FPType fPType);
354 
356  FPType fPType);
357 
//! Bulk convenience function taking a vector of molecules and a fingerprint type.
//! Returns a pointer to a vector of SparseIntVect<std::uint64_t> pointers
//! (presumably one fingerprint per input molecule, with the caller owning the
//! result - TODO confirm against the implementation).
//! NOTE(review): \c molVector is declared by value, so the vector of pointers
//! is copied on every call.
358 RDKIT_FINGERPRINTS_EXPORT std::vector<SparseIntVect<std::uint64_t> *> *
359 getSparseCountFPBulk(const std::vector<const ROMol *> molVector, FPType fPType);
360 
362  const std::vector<const ROMol *> molVector, FPType fPType);
363 
//! Bulk convenience function taking a vector of molecules and a fingerprint type.
//! Returns a pointer to a vector of SparseIntVect<std::uint32_t> pointers
//! (presumably one fingerprint per input molecule, with the caller owning the
//! result - TODO confirm against the implementation).
//! NOTE(review): \c molVector is declared by value, so the vector of pointers
//! is copied on every call.
364 RDKIT_FINGERPRINTS_EXPORT std::vector<SparseIntVect<std::uint32_t> *>
365  *getCountFPBulk(const std::vector<const ROMol *> molVector, FPType fPType);
366 
368  const std::vector<const ROMol *> molVector, FPType fPType);
369 
370 } // namespace RDKit
371 
372 #endif
a class for bit vectors that are densely occupied
abstract base class that generates atom-environments from a molecule
virtual std::string infoString() const =0
method that returns information about this AtomEnvironmentGenerator and its arguments if any
virtual std::vector< AtomEnvironment< OutputType > * > getEnvironments(const ROMol &mol, FingerprintArguments< OutputType > *arguments, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *atomInvariants=nullptr, const std::vector< std::uint32_t > *bondInvariants=nullptr, const bool hashResults=false) const =0
generate and return all atom-environments from a molecule
abstract base class that holds atom-environments that will be hashed to generate the fingerprint
virtual OutputType getBitId(FingerprintArguments< OutputType > *arguments, const std::vector< std::uint32_t > *atomInvariants, const std::vector< std::uint32_t > *bondInvariants, const AdditionalOutput *AdditionalOutput, const bool hashResults=false, const std::uint64_t fpSize=0) const =0
calculates and returns the bit id to be set for this atom-environment
abstract base class for atom invariants generators
virtual std::string infoString() const =0
method that returns information about this AtomInvariantsGenerator and its arguments
virtual AtomInvariantsGenerator * clone() const =0
virtual std::vector< std::uint32_t > * getAtomInvariants(const ROMol &mol) const =0
get atom invariants from a molecule
abstract base class for bond invariants generators
virtual std::string infoString() const =0
method that returns information about this BondInvariantsGenerator and its arguments
virtual BondInvariantsGenerator * clone() const =0
virtual std::vector< std::uint32_t > * getBondInvariants(const ROMol &mol) const =0
get bond invariants from a molecule
Abstract base class that holds molecule independent arguments that are common amongst all fingerprint...
virtual std::string infoString() const =0
method that returns information string about the fingerprint specific argument set and the arguments ...
virtual OutputType getResultSize() const =0
Returns the size of the fingerprint based on arguments.
const std::vector< std::uint32_t > d_countBounds
FingerprintArguments(bool countSimulation, const std::vector< std::uint32_t > countBounds, std::uint32_t fpSize, std::uint32_t numBitsPerFeature=1)
std::string commonArgumentsString() const
method that returns information string about common fingerprinting arguments' values
const std::uint32_t d_numBitsPerFeature
class that generates same fingerprint style for different output formats
SparseIntVect< std::uint32_t > * getCountFingerprint(const ROMol &mol, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *customAtomInvariants=nullptr, const std::vector< std::uint32_t > *customBondInvariants=nullptr) const
std::string infoString() const
SparseBitVect * getSparseFingerprint(const ROMol &mol, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *customAtomInvariants=nullptr, const std::vector< std::uint32_t > *customBondInvariants=nullptr) const
FingerprintGenerator(AtomEnvironmentGenerator< OutputType > *atomEnvironmentGenerator, FingerprintArguments< OutputType > *fingerprintArguments, AtomInvariantsGenerator *atomInvariantsGenerator=nullptr, BondInvariantsGenerator *bondInvariantsGenerator=nullptr, bool ownsAtomInvGenerator=false, bool ownsBondInvGenerator=false)
SparseIntVect< OutputType > * getSparseCountFingerprint(const ROMol &mol, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *customAtomInvariants=nullptr, const std::vector< std::uint32_t > *customBondInvariants=nullptr) const
ExplicitBitVect * getFingerprint(const ROMol &mol, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *customAtomInvariants=nullptr, const std::vector< std::uint32_t > *customBondInvariants=nullptr) const
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:28
used to indicate errors for unimplemented fp types in convenience functions
UnimplementedFPException(const char *msg)
construct with an error message
~UnimplementedFPException() noexcept override=default
UnimplementedFPException(std::string msg)
construct with an error message
const char * what() const noexcept override
get the error message
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:34
#define RDKIT_FINGERPRINTS_EXPORT
Definition: export.h:177
Std stuff.
Definition: Abbreviations.h:19
RDKIT_FINGERPRINTS_EXPORT std::vector< ExplicitBitVect * > * getFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseIntVect< std::uint64_t > * > * getSparseCountFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseIntVect< std::uint32_t > * > * getCountFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT SparseBitVect * getSparseFP(const ROMol &mol, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFP(const ROMol &mol, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< std::uint64_t > * getSparseCountFP(const ROMol &mol, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseBitVect * > * getSparseFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< std::uint32_t > * getCountFP(const ROMol &mol, FPType fPType)
std::vector< std::vector< std::uint64_t > > atomToBitsType
std::vector< unsigned int > atomCountsType
std::map< std::uint64_t, std::vector< std::vector< int > >> bitPathsType
std::map< std::uint64_t, std::vector< std::pair< std::uint32_t, std::uint32_t > >> bitInfoMapType