RDKit
Open-source cheminformatics and machine learning.
FPBReader.h
Go to the documentation of this file.
1 //
2 // Copyright (c) 2016 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_FPBREADER_H_DEC2015
12 #define RD_FPBREADER_H_DEC2015
13 /*! \file FPBReader.h
14 
15  \brief contains a simple class for reading and searching FPB files
16 
17  \b Note that this functionality is experimental and the API may change
18  in future releases.
19 */
20 
21 #include <iostream>
22 #include <fstream>
23 #include <sstream>
24 #include <string>
27 
28 #include <cstdint>
29 #include <boost/shared_ptr.hpp>
30 #include <boost/shared_array.hpp>
31 
32 namespace RDKit {
33 namespace detail {
34 struct FPBReader_impl;
35 }
36 
37 //! class for reading and searching FPB files
38 /*!
39  basic usage:
40  \code
41  FPBReader reader("foo.fpb");
42  reader.init();
43  boost::shared_ptr<ExplicitBitVect> ebv = reader.getFP(95);
44  std::vector<std::pair<double, unsigned int> > nbrs =
45  reader.getTanimotoNeighbors(*ebv.get(), 0.70);
46  \endcode
47 
48  \b Note: this functionality is experimental and the API may change
49  in future releases.
50 
51  <b>Note on thread safety</b>
52  Operations that involve reading from the FPB file are not thread safe.
53  This means that the \c init() method is not thread safe and none of the
54  search operations are thread safe when an \c FPBReader is initialized in
55  \c lazyRead mode.
56 
57 */
59  public:
60  FPBReader() {}  // default ctor: members keep their in-class initializers (null stream/impl pointers, all flags false)
61 
62  //! ctor for reading from a named file
63  /*!
64  \param fname the name of the file to read
65  \param lazyRead if set to \c false all fingerprints from the file will be read
66  into memory when \c init() is called.
67  */
68  FPBReader(const char *fname, bool lazyRead = false) {
69  _initFromFilename(fname, lazyRead);  // opens fname as a binary ifstream; throws BadFileException if the stream is not usable
70  }
71  //! \overload
72  FPBReader(const std::string &fname, bool lazyRead = false) {
73  _initFromFilename(fname.c_str(), lazyRead);  // same path as the const char* ctor, so the same BadFileException can be thrown
74  }
75  //! ctor for reading from an open istream
76  /*!
77  \param inStream the stream to read from
78  \param takeOwnership if set, we will take over ownership of the stream pointer
79  \param lazyRead if set to \c false all fingerprints from the file will be read
80  into memory when \c init() is called.
81 
82  Some additional notes:
83  - if \c lazyRead is set, \c inStream must support the \c seekg() and \c
84  tellg() operations.
85 
86  */
87  FPBReader(std::istream *inStream, bool takeOwnership = true,
88  bool lazyRead = false)
89  : dp_istrm(inStream),  // pointer is stored as-is; the empty ctor body reads nothing from the stream
90  dp_impl(nullptr),  // no implementation object exists at construction time
91  df_owner(takeOwnership),  // when set, the teardown code that follows deletes dp_istrm
92  df_init(false),
93  df_lazyRead(lazyRead) {}
95  destroy();
96  if (df_owner) {
97  delete dp_istrm;
98  }
99  dp_istrm = nullptr;
100  df_init = false;
101  }
102 
103  //! Read the data from the file and initialize internal data structures
104  /*!
105  This must be called before most of the other methods of this class.
106 
107  Some notes:
108  \li if \c lazyRead is not set, all fingerprints will be read into memory. This
109  can require substantial amounts of memory for large files.
110  \li For large files, this can take a long time.
111  \li If \c lazyRead and \c takeOwnership are both \c false it is safe to close
112  and delete inStream after calling \c init()
113  */
114  void init();
115  //! cleanup
116  /*!
117  Cleans up whatever memory was allocated during init()
118  */
119  void cleanup() {
120  if (!df_init) {  // guard: destroy() is skipped while the reader is not flagged as initialized
121  return;
122  }
123  destroy();
124  df_init = false;  // clear the flag so that a repeated cleanup() call returns early
125  }
126  //! returns the requested fingerprint as an \c ExplicitBitVect
127  boost::shared_ptr<ExplicitBitVect> getFP(unsigned int idx) const;
128  //! returns the requested fingerprint as an array of bytes
129  boost::shared_array<std::uint8_t> getBytes(unsigned int idx) const;
130 
131  //! returns the id of the requested fingerprint
132  std::string getId(unsigned int idx) const;
133  //! returns the fingerprint and id of the requested fingerprint
134  std::pair<boost::shared_ptr<ExplicitBitVect>, std::string> operator[](
135  unsigned int idx) const {
136  return std::make_pair(getFP(idx), getId(idx));  // (fingerprint, id) built from two separate lookups on the same index
137  }
138 
139  //! returns beginning and end indices of fingerprints having on-bit counts
140  //! within the range (including end points)
141  std::pair<unsigned int, unsigned int> getFPIdsInCountRange(
142  unsigned int minCount, unsigned int maxCount);
143 
144  //! returns the number of fingerprints
145  unsigned int length() const;
146  //! returns the number of bits in our fingerprints
147  unsigned int nBits() const;
148 
149  //! returns the tanimoto similarity between the specified fingerprint and the
150  //! provided fingerprint
151  double getTanimoto(unsigned int idx, const std::uint8_t *bv) const;
152  //! \overload
153  double getTanimoto(unsigned int idx,
154  boost::shared_array<std::uint8_t> bv) const {  // bv is taken by value, so this copy keeps the buffer alive during the call
155  return getTanimoto(idx, bv.get());  // forwards to the const std::uint8_t* overload
156  }
157  //! \overload
158  double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const;
159 
160  //! returns tanimoto neighbors that are within a similarity threshold
161  /*!
162  The result vector of (similarity,index) pairs is sorted in order
163  of decreasing similarity
164 
165  \param bv the query fingerprint
166  \param threshold the minimum similarity to return
167  \param usePopcountScreen if this is true (the default) the popcount of the
168  neighbors will be used to reduce the number of calculations that need
169  to be done
170 
171  */
172  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
173  const std::uint8_t *bv, double threshold = 0.7,
174  bool usePopcountScreen = true) const;
175  //! \overload
176  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
177  boost::shared_array<std::uint8_t> bv, double threshold = 0.7,
178  bool usePopcountScreen = true) const {
179  return getTanimotoNeighbors(bv.get(), threshold, usePopcountScreen);  // forwards to the const std::uint8_t* overload; other arguments pass through unchanged
180  }
181  //! \overload
182  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
183  const ExplicitBitVect &ebv, double threshold = 0.7,
184  bool usePopcountScreen = true) const;
185 
186  //! returns the Tversky similarity between the specified fingerprint and the
187  //! provided fingerprint
188  /*!
189 
190  \param idx the fingerprint to compare to
191  \param bv the query fingerprint
192  \param ca the Tversky a coefficient
193  \param cb the Tversky b coefficient
194 
195  */
196  double getTversky(unsigned int idx, const std::uint8_t *bv, double ca,
197  double cb) const;
198  //! \overload
199  double getTversky(unsigned int idx, boost::shared_array<std::uint8_t> bv,
200  double ca, double cb) const {
201  return getTversky(idx, bv.get(), ca, cb);  // forwards to the const std::uint8_t* overload; ca and cb pass through unchanged
202  }
203  //! \overload
204  double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca,
205  double cb) const;
206 
207  //! returns Tversky neighbors that are within a similarity threshold
208  /*!
209  The result vector of (similarity,index) pairs is sorted in order
210  of decreasing similarity
211 
212  \param bv the query fingerprint
213  \param ca the Tversky a coefficient
214  \param cb the Tversky b coefficient
215  \param threshold the minimum similarity to return
216  \param usePopcountScreen if this is true (the default) the popcount of the
217  neighbors will be used to reduce the number of calculations that need
218  to be done
219 
220  */
221  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
222  const std::uint8_t *bv, double ca, double cb, double threshold = 0.7,
223  bool usePopcountScreen = true) const;
224  //! \overload
225  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
226  boost::shared_array<std::uint8_t> bv, double ca, double cb,
227  double threshold = 0.7, bool usePopcountScreen = true) const {
228  return getTverskyNeighbors(bv.get(), ca, cb, threshold, usePopcountScreen);  // forwards to the const std::uint8_t* overload; other arguments pass through unchanged
229  }
230  //! \overload
231  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
232  const ExplicitBitVect &ebv, double ca, double cb, double threshold = 0.7,
233  bool usePopcountScreen = true) const;
234 
235  //! returns indices of all fingerprints that completely contain this one
236  /*! (i.e. where all the bits set in the query are also set in the db
237  molecule)
238  */
239  std::vector<unsigned int> getContainingNeighbors(
240  const std::uint8_t *bv) const;
241  //! \overload
242  std::vector<unsigned int> getContainingNeighbors(
243  boost::shared_array<std::uint8_t> bv) const {
244  return getContainingNeighbors(bv.get());  // forwards to the const std::uint8_t* overload
245  }
246  //! \overload
247  std::vector<unsigned int> getContainingNeighbors(
248  const ExplicitBitVect &ebv) const;
249 
250  private:
251  std::istream *dp_istrm{nullptr};
252  detail::FPBReader_impl *dp_impl{nullptr}; // implementation details
253  bool df_owner{false};
254  bool df_init{false};
255  bool df_lazyRead{false};
256 
257  // disable automatic copy constructors and assignment operators
258  // for this class and its subclasses. They will likely be
259  // carrying around stream pointers and copying those is a recipe
260  // for disaster.
261  FPBReader(const FPBReader &);
262  FPBReader &operator=(const FPBReader &);
263  void destroy();
264  void _initFromFilename(const char *fname, bool lazyRead) {  // shared helper for the two filename constructors
265  std::istream *tmpStream = static_cast<std::istream *>(
266  new std::ifstream(fname, std::ios_base::binary));  // heap-allocated; handed to dp_istrm below on success
267  if (!(*tmpStream) || (tmpStream->bad())) {  // stream reports failure (e.g. the file could not be opened)
268  std::ostringstream errout;
269  errout << "Bad input file " << fname;
270  delete tmpStream;  // release the stream before throwing so the failed open does not leak it
271  throw BadFileException(errout.str());
272  }
273  dp_istrm = tmpStream;  // NOTE(review): any previous dp_istrm is overwritten without being freed; the only visible callers are constructors
274  dp_impl = nullptr;
275  df_owner = true;  // this reader created the stream, so it is flagged as the owner
276  df_init = false;  // only the stream is opened here; the initialized flag stays cleared
277  df_lazyRead = lazyRead;
278  }
279 };
280 } // namespace RDKit
281 #endif
a class for bit vectors that are densely occupied
class for reading and searching FPB files
Definition: FPBReader.h:58
void cleanup()
cleanup
Definition: FPBReader.h:119
double getTversky(unsigned int idx, const std::uint8_t *bv, double ca, double cb) const
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const ExplicitBitVect &ebv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::pair< unsigned int, unsigned int > getFPIdsInCountRange(unsigned int minCount, unsigned int maxCount)
unsigned int length() const
returns the number of fingerprints
double getTanimoto(unsigned int idx, const std::uint8_t *bv) const
boost::shared_ptr< ExplicitBitVect > getFP(unsigned int idx) const
returns the requested fingerprint as an ExplicitBitVect
boost::shared_array< std::uint8_t > getBytes(unsigned int idx) const
returns the requested fingerprint as an array of bytes
double getTanimoto(unsigned int idx, boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:153
double getTversky(unsigned int idx, boost::shared_array< std::uint8_t > bv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:199
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const ExplicitBitVect &ebv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
FPBReader(std::istream *inStream, bool takeOwnership=true, bool lazyRead=false)
ctor for reading from an open istream
Definition: FPBReader.h:87
std::vector< unsigned int > getContainingNeighbors(const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
FPBReader(const char *fname, bool lazyRead=false)
ctor for reading from a named file
Definition: FPBReader.h:68
FPBReader(const std::string &fname, bool lazyRead=false)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:72
std::vector< unsigned int > getContainingNeighbors(boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:242
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const std::uint8_t *bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
returns Tversky neighbors that are within a similarity threshold
unsigned int nBits() const
returns the number of bits in our fingerprints
std::vector< unsigned int > getContainingNeighbors(const std::uint8_t *bv) const
returns indices of all fingerprints that completely contain this one
double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::string getId(unsigned int idx) const
returns the id of the requested fingerprint
std::pair< boost::shared_ptr< ExplicitBitVect >, std::string > operator[](unsigned int idx) const
returns the fingerprint and id of the requested fingerprint
Definition: FPBReader.h:134
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(boost::shared_array< std::uint8_t > bv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:176
void init()
Read the data from the file and initialize internal data structures.
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const std::uint8_t *bv, double threshold=0.7, bool usePopcountScreen=true) const
returns tanimoto neighbors that are within a similarity threshold
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(boost::shared_array< std::uint8_t > bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:225
#define RDKIT_DATASTRUCTS_EXPORT
Definition: export.h:81
Std stuff.
Definition: Abbreviations.h:19
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFP(const ROMol &mol, FPType fPType)