RDKit
Open-source cheminformatics and machine learning.
MolSupplier.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2022 greg landrum and other RDKit contributors
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_MOLSUPPLIER_H
12 #define RD_MOLSUPPLIER_H
13 
14 #include <RDGeneral/types.h>
15 
16 #include <string>
17 #include <string_view>
18 #include <list>
19 #include <memory>
20 #include <vector>
21 #include <iostream>
22 #include <fstream>
23 #include <GraphMol/ROMol.h>
25 
26 #ifdef RDK_BUILD_MAEPARSER_SUPPORT
27 namespace schrodinger {
28 namespace mae {
29 class Reader;
30 class Block;
31 } // namespace mae
32 } // namespace schrodinger
33 #endif // RDK_BUILD_MAEPARSER_SUPPORT
34 
35 namespace RDKit {
36 RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
37 
38 /*!
39 //
40 // Here are a couple of ways one can interact with MolSuppliers:
41 //
42 // 1) Lazy (ForwardIterator):
43 // while(!supplier.atEnd()){
44 // ROMol *mol = supplier.next();
45 // if(mol){
46 // do something;
47 // }
48 // }
49 // 2) Random Access:
50 // for(int i=0;i<supplier.length();i++){
51 // ROMol *mol = supplier[i];
52 // if(mol){
53 // do something;
54 // }
55 // }
56 //
57 //
58 */
60  // this is an abstract base class to supply molecules one at a time
61  public:
63  virtual ~MolSupplier() {}
64  virtual void init() = 0;
65  virtual void reset() = 0;
66  virtual bool atEnd() = 0;
67  virtual ROMol *next() = 0;
68 
69  virtual void close() {
70  if (df_owner) {
71  delete dp_inStream;
72  df_owner = false;
73  }
74  dp_inStream = nullptr;
75  }
76 
77  private:
78  // disable automatic copy constructors and assignment operators
79  // for this class and its subclasses. They will likely be
80  // carrying around stream pointers and copying those is a recipe
81  // for disaster.
82  MolSupplier(const MolSupplier &);
83  MolSupplier &operator=(const MolSupplier &);
84 
85  protected:
86  // stream to read the molecules from:
87  std::istream *dp_inStream = nullptr;
88  // do we own dp_inStream?
89  bool df_owner = false;
90  // opens a stream for reading and verifies that it can be read from.
91  // if not it throws an exception
92  // the caller owns the resulting stream
93  std::istream *openAndCheckStream(const std::string &filename) {
94  // FIX: this binary mode of opening file is here because of a bug in
95  // VC++ 6.0
96  // the function "tellg" does not work correctly if we do not open it this
97  // way
98  // Jan 2009: Confirmed that this is still the case in visual studio 2008
99  std::ifstream *strm =
100  new std::ifstream(filename.c_str(), std::ios_base::binary);
101  if ((!(*strm)) || strm->bad()) {
102  std::ostringstream errout;
103  errout << "Bad input file " << filename;
104  delete strm;
105  throw BadFileException(errout.str());
106  }
107 
108  strm->peek();
109  if (strm->bad() || strm->eof()) {
110  std::ostringstream errout;
111  errout << "Invalid input file " << filename;
112  delete strm;
113  throw BadFileException(errout.str());
114  }
115  return static_cast<std::istream *>(strm);
116  }
117 };
118 
119 // \brief a supplier from an SD file that only reads forward:
121  /*************************************************************************
122  * A lazy mol supplier from a SD file.
123  * - When new molecules are read using "next" their positions in the file are
124  *noted.
125  ***********************************************************************************/
126  public:
127  ForwardSDMolSupplier() { init(); }
128 
129  explicit ForwardSDMolSupplier(std::istream *inStream,
130  bool takeOwnership = true, bool sanitize = true,
131  bool removeHs = true,
132  bool strictParsing = false);
133 
134  ~ForwardSDMolSupplier() override { close(); }
135 
136  void init() override;
137  void reset() override;
138  ROMol *next() override;
139  bool atEnd() override;
140 
141  void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }
142  bool getProcessPropertyLists() const { return df_processPropertyLists; }
143 
144  bool getEOFHitOnRead() const { return df_eofHitOnRead; }
145 
146  protected:
147  virtual void checkForEnd();
149  virtual void readMolProps(ROMol *);
150  bool df_end = false;
151  int d_line = 0; // line number we are currently on
152  bool df_sanitize = true, df_removeHs = true, df_strictParsing = true;
153  bool df_processPropertyLists = true;
154  bool df_eofHitOnRead = false;
155 };
156 
157 // \brief a lazy supplier from an SD file
159  /*************************************************************************
160  * A lazy mol supplier from a SD file.
161  * - When new molecules are read using "next" their positions in the file are
162  *noted.
163  * - A call to the "length" will automatically parse the entire file and
164  *cache all the mol
165  * block positions
166  * - [] operator is used to access a molecule at "idx", calling next
167  *following this will result
168  * in the next molecule after "idx"
169  ***********************************************************************************/
170 
171  public:
172  SDMolSupplier() { init(); }
173 
174  /*!
175  * \param fileName - the name of the SD file
176  * \param sanitize - if true sanitize the molecule before returning it
177  * \param removeHs - if true remove Hs from the molecule before returning it
178  * (triggers sanitization)
179  * \param strictParsing - if set to false, the parser is more lax about
180  * correctness
181  * of the contents.
182  */
183  explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
184  bool removeHs = true, bool strictParsing = true);
185 
186  explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
187  bool sanitize = true, bool removeHs = true,
188  bool strictParsing = true);
189 
190  ~SDMolSupplier() override { close(); }
191  void init() override;
192  void reset() override;
193  ROMol *next() override;
194  bool atEnd() override;
195  void moveTo(unsigned int idx);
196  ROMol *operator[](unsigned int idx);
197  /*! \brief returns the text block for a particular item
198  *
199  * \param idx - which item to return
200  */
201  std::string getItemText(unsigned int idx);
202  unsigned int length();
203  void setData(const std::string &text, bool sanitize = true,
204  bool removeHs = true);
205  void setData(const std::string &text, bool sanitize, bool removeHs,
206  bool strictParsing);
207 
208  /*! Resets our internal state and sets the indices of molecules in the stream.
209  * The client should be *very* careful about calling this method, as it's
210  *trivial
211  * to end up with a completely useless supplier.
212  *
213  * \param locs - the vector of stream positions.
214  *
215  * Note that this can be used not only to make reading selected molecules
216  *from a
217  * large SD file much faster, but it can also allow subsetting an SD file or
218  * rearranging the order of the molecules.
219  */
220  void setStreamIndices(const std::vector<std::streampos> &locs);
221 
222  private:
223  void checkForEnd() override;
224  void setDataCommon(const std::string &text, bool sanitize, bool removeHs);
225  int d_len = 0; // total number of mol blocks in the file (initialized to -1)
226  int d_last = 0; // the molecule we are ready to read
227  std::vector<std::streampos> d_molpos;
228 };
229 
230 //! lazy file parser for Smiles tables
232  /**************************************************************************
233  * Lazy file parser for Smiles table file, similar to the lazy SD
234  * file parser above
235  * - As and when new molecules are read using "next" their
236  * positions in the file are noted.
237  * - A call to "length" will automatically parse the entire
238  * file and cache all the mol block positions
239  * - [] operator is used to access a molecule at "idx", calling
240  * next following this will result in the next molecule after
241  * "idx"
242  ***************************************************************************/
243  public:
244  /*!
245  * \param fileName - the name of smiles table file
246  * \param delimiter - delimiting characters between records on each
247  * line NOTE that this is not a string, the tokenizer looks for
248  * the individual characters in delimiter, not the full string
249  * itself. So the default delimiter: " \t", means " " or "\t".
250  * \param smilesColumn - column number for the SMILES string (defaults
251  * to the first column)
252  * \param nameColumn - column number for the molecule name (defaults to
253  * the second column) If set to -1 we assume that no name is
254  * available for the molecule and the name is defaulted to the
255  * smiles string
256  * \param titleLine - if true, the first line is assumed to list the
257  * names of properties in order separated by 'delimiter'. It is
258  * also assumed that the 'SMILES' column and the 'name' column
259  * are not specified here. If false, no title line is assumed
260  * and the properties are recorded as "columnX", where "X" is
261  * the column number
262  * \param sanitize - if true sanitize the molecule before returning it
263  */
264  explicit SmilesMolSupplier(const std::string &fileName,
265  const std::string &delimiter = " \t",
266  int smilesColumn = 0, int nameColumn = 1,
267  bool titleLine = true, bool sanitize = true);
269  explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
270  const std::string &delimiter = " \t",
271  int smilesColumn = 0, int nameColumn = 1,
272  bool titleLine = true, bool sanitize = true);
273 
274  ~SmilesMolSupplier() override { close(); }
275  void setData(const std::string &text, const std::string &delimiter = " ",
276  int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
277  bool sanitize = true);
278  void init() override;
279  void reset() override;
280  ROMol *next() override;
281  bool atEnd() override;
282  void moveTo(unsigned int idx);
283  ROMol *operator[](unsigned int idx);
284  /*! \brief returns the text block for a particular item
285  *
286  * \param idx - which item to return
287  */
288  std::string getItemText(unsigned int idx);
289  unsigned int length();
290 
291  private:
292  ROMol *processLine(std::string inLine);
293  void processTitleLine();
294  std::string nextLine();
295  long int skipComments();
296  void checkForEnd();
297 
298  bool df_end = false; // have we reached the end of the file?
299  long d_len = 0; // total number of smiles in the file
300  long d_next = 0; // the molecule we are ready to read
301  size_t d_line = 0; // line number we are currently on
302  std::vector<std::streampos>
303  d_molpos; // vector of positions in the file for molecules
304  std::vector<int> d_lineNums;
305  std::string d_delim; // the delimiter string
306  bool df_sanitize = true; // sanitize molecules before returning them?
307  STR_VECT d_props; // vector of property names
308  bool df_title = true; // do we have a title line?
309  int d_smi = 0; // column id for the smile string
310  int d_name = 1; // column id for the name
311 };
312 
313 //! lazy file parser for TDT files
315  /**************************************************************************
316  * Lazy file parser for TDT files, similar to the lazy SD
317  * file parser above
318  * - As and when new molecules are read using "next" their
319  * positions in the file are noted.
320  * - A call to "length" will automatically parse the entire
321  * file and cache all the mol block positions
322  * - [] operator is used to access a molecule at "idx", calling
323  * next following this will result in the next molecule after
324  * "idx"
325  ***************************************************************************/
326  public:
327  /*!
328  * \param fileName - the name of the TDT file
329  * \param nameRecord - property name for the molecule name.
330  * If empty (the default), the name defaults to be empty
331  * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
332  * structure (depiction) in the input will be read into the
333  * corresponding conformer id.
334  * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
335  * structure (depiction) in the input will be read into the
336  * corresponding conformer id.
337  * \param sanitize - if true sanitize the molecule before returning it
338  */
339  explicit TDTMolSupplier(const std::string &fileName,
340  const std::string &nameRecord = "", int confId2D = -1,
341  int confId3D = 0, bool sanitize = true);
342  explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
343  const std::string &nameRecord = "", int confId2D = -1,
344  int confId3D = 0, bool sanitize = true);
346  ~TDTMolSupplier() override { close(); }
347  void setData(const std::string &text, const std::string &nameRecord = "",
348  int confId2D = -1, int confId3D = 0, bool sanitize = true);
349  void init() override;
350  void reset() override;
351  ROMol *next() override;
352  bool atEnd() override;
353  void moveTo(unsigned int idx);
354  ROMol *operator[](unsigned int idx);
355  /*! \brief returns the text block for a particular item
356  *
357  * \param idx - which item to return
358  */
359  std::string getItemText(unsigned int idx);
360  unsigned int length();
361 
362  private:
363  bool advanceToNextRecord();
364  void checkForEnd();
365  ROMol *parseMol(std::string inLine);
366 
367  bool df_end = false; // have we reached the end of the file?
368  int d_len = 0; // total number of mols in the file
369  int d_last = 0; // the molecule we are ready to read
370  int d_line = 0; // line number we are currently on
371  int d_confId2D = -1; // id to use for 2D conformers
372  int d_confId3D = 0; // id to use for 3D conformers
373  std::vector<std::streampos>
374  d_molpos; // vector of positions in the file for molecules
375  bool df_sanitize = true; // sanitize molecules before returning them?
376  std::string d_nameProp =
377  ""; // local storage for the property providing mol names
378 };
379 
380 //! lazy file parser for PDB files
382  public:
383  explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
384  bool sanitize = true, bool removeHs = true,
385  unsigned int flavor = 0,
386  bool proximityBonding = true);
387  explicit PDBMolSupplier(const std::string &fname, bool sanitize = true,
388  bool removeHs = true, unsigned int flavor = 0,
389  bool proximityBonding = true);
390 
391  ~PDBMolSupplier() override { close(); }
392 
393  void init() override;
394  void reset() override;
395  ROMol *next() override;
396  bool atEnd() override;
397 
398  protected:
399  bool df_sanitize, df_removeHs, df_proximityBonding;
400  unsigned int d_flavor;
401 };
402 #ifdef RDK_BUILD_MAEPARSER_SUPPORT
403 //! lazy file parser for MAE files
404 class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier {
405  /**
406  * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
407  * always requires taking ownership of the istream ptr, as the shared ptr will
408  * always clear it upon destruction.
409  */
410 
411  public:
412  MaeMolSupplier() { init(); }
413 
414  explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream,
415  bool sanitize = true, bool removeHs = true);
416 
417  explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,
418  bool sanitize = true, bool removeHs = true);
419 
420  explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,
421  bool removeHs = true);
422 
423  ~MaeMolSupplier() override {}
424 
425  void init() override;
426  void reset() override;
427  ROMol *next() override;
428  bool atEnd() override;
429 
430  void close() override { dp_sInStream.reset(); }
431 
432  private:
433  void moveToNextBlock();
434 
435  protected:
436  bool df_sanitize, df_removeHs;
437  std::shared_ptr<schrodinger::mae::Reader> d_reader;
438  std::shared_ptr<schrodinger::mae::Block> d_next_struct;
439  std::shared_ptr<std::istream> dp_sInStream;
440  std::string d_stored_exc;
441 };
442 #endif // RDK_BUILD_MAEPARSER_SUPPORT
443 } // namespace RDKit
444 
445 #endif
Defines the primary molecule class ROMol as well as associated typedefs.
used by various file parsing classes to indicate a bad file
virtual void readMolProps(ROMol *)
void setProcessPropertyLists(bool val)
Definition: MolSupplier.h:141
ROMol * next() override
ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=false)
bool getProcessPropertyLists() const
Definition: MolSupplier.h:142
virtual bool atEnd()=0
std::istream * openAndCheckStream(const std::string &filename)
Definition: MolSupplier.h:93
virtual void reset()=0
virtual void init()=0
virtual ROMol * next()=0
virtual ~MolSupplier()
Definition: MolSupplier.h:63
virtual void close()
Definition: MolSupplier.h:69
lazy file parser for PDB files
Definition: MolSupplier.h:381
~PDBMolSupplier() override
Definition: MolSupplier.h:391
PDBMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
bool atEnd() override
ROMol * next() override
void reset() override
void init() override
PDBMolSupplier(const std::string &fname, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
unsigned int d_flavor
Definition: MolSupplier.h:400
void setStreamIndices(const std::vector< std::streampos > &locs)
void setData(const std::string &text, bool sanitize=true, bool removeHs=true)
bool atEnd() override
unsigned int length()
void reset() override
void setData(const std::string &text, bool sanitize, bool removeHs, bool strictParsing)
SDMolSupplier(const std::string &fileName, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
ROMol * next() override
SDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
void moveTo(unsigned int idx)
void init() override
~SDMolSupplier() override
Definition: MolSupplier.h:190
ROMol * operator[](unsigned int idx)
lazy file parser for Smiles tables
Definition: MolSupplier.h:231
~SmilesMolSupplier() override
Definition: MolSupplier.h:274
void moveTo(unsigned int idx)
ROMol * next() override
SmilesMolSupplier(const std::string &fileName, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
void init() override
ROMol * operator[](unsigned int idx)
SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
void reset() override
bool atEnd() override
std::string getItemText(unsigned int idx)
returns the text block for a particular item
void setData(const std::string &text, const std::string &delimiter=" ", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
lazy file parser for TDT files
Definition: MolSupplier.h:314
void moveTo(unsigned int idx)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
bool atEnd() override
~TDTMolSupplier() override
Definition: MolSupplier.h:346
void init() override
TDTMolSupplier(const std::string &fileName, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
ROMol * operator[](unsigned int idx)
ROMol * next() override
void reset() override
void setData(const std::string &text, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
unsigned int length()
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:161
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
Std stuff.
Definition: Abbreviations.h:19
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
std::vector< std::string > STR_VECT
Definition: Dict.h:29