RDKit
Open-source cheminformatics and machine learning.
FileParsers.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2022 Greg Landrum and other RDKit contributors
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_FILEPARSERS_H
12 #define RD_FILEPARSERS_H
13 
14 #include <RDGeneral/types.h>
15 #include <GraphMol/RDKitBase.h>
16 #include "CDXMLParser.h"
17 #include <string>
18 #include <string_view>
19 #include <iostream>
20 #include <vector>
21 #include <exception>
22 
23 #include <boost/shared_ptr.hpp>
24 
25 namespace RDKit {
26 const int MOLFILE_MAXLINE = 256;
27 RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
28 
30  : public std::exception {
31  public:
32  //! construct with an error message
33  explicit MolFileUnhandledFeatureException(const char *msg) : _msg(msg) {}
34  //! construct with an error message
35  explicit MolFileUnhandledFeatureException(const std::string msg)
36  : _msg(msg) {}
37  //! get the error message
38  const char *what() const noexcept override { return _msg.c_str(); }
39  ~MolFileUnhandledFeatureException() noexcept override = default;
40 
41  private:
42  std::string _msg;
43 };
44 
45 //-----
46 // mol files
47 //-----
48 typedef std::vector<RWMOL_SPTR> RWMOL_SPTR_VECT;
49 // \brief construct a molecule from MDL mol data in a stream
50 /*!
51  * \param inStream - stream containing the data
52  * \param line - current line number (used for error reporting)
53  * \param sanitize - toggles sanitization and stereochemistry
54  * perception of the molecule
55  * \param removeHs - toggles removal of Hs from the molecule. H removal
56  * is only done if the molecule is sanitized
58  * \param strictParsing - if set to false, the parser is more lax about
59  * correctness of the contents.
60  *
61  */
63  unsigned int &line,
64  bool sanitize = true,
65  bool removeHs = true,
66  bool strictParsing = true);
67 // \overload
69  unsigned int &line,
70  bool sanitize = true,
71  bool removeHs = true,
72  bool strictParsing = true);
73 // \brief construct a molecule from an MDL mol block
74 /*!
75  * \param molBlock - string containing the mol block
76  * \param sanitize - toggles sanitization and stereochemistry
77  * perception of the molecule
78  * \param removeHs - toggles removal of Hs from the molecule. H removal
79  * is only done if the molecule is sanitized
80  * \param strictParsing - if set to false, the parser is more lax about
81  * correctness of the contents.
82  */
83 RDKIT_FILEPARSERS_EXPORT RWMol *MolBlockToMol(const std::string &molBlock,
84  bool sanitize = true,
85  bool removeHs = true,
86  bool strictParsing = true);
87 
88 // \brief construct a molecule from an MDL mol file
89 /*!
90  * \param fName - string containing the file name
91  * \param sanitize - toggles sanitization and stereochemistry
92  * perception of the molecule
93  * \param removeHs - toggles removal of Hs from the molecule. H removal
94  * is only done if the molecule is sanitized
95  * \param strictParsing - if set to false, the parser is more lax about
96  * correctness of the contents.
97  */
98 RDKIT_FILEPARSERS_EXPORT RWMol *MolFileToMol(const std::string &fName,
99  bool sanitize = true,
100  bool removeHs = true,
101  bool strictParsing = true);
102 
103 // \brief generates an MDL mol block for a molecule
104 /*!
105  * \param mol - the molecule in question
106  * \param includeStereo - toggles inclusion of stereochemistry information
107  * \param confId - selects the conformer to be used
108  * \param kekulize - triggers kekulization of the molecule before it is
109  * written
110  * \param forceV3000 - force generation of a V3000 mol block (happens
111  * automatically with
112  * more than 999 atoms or bonds)
113  */
115  bool includeStereo = true,
116  int confId = -1,
117  bool kekulize = true,
118  bool forceV3000 = false);
119 
120 // \brief generates an MDL v3000 mol block for a molecule (thin wrapper around MolToMolBlock())
121 /*!
122  * \param mol - the molecule in question
123  * \param includeStereo - toggles inclusion of stereochemistry information
124  * \param confId - selects the conformer to be used
125  * \param kekulize - triggers kekulization of the molecule before it is
126  * written
127  */
128 inline std::string MolToV3KMolBlock(const ROMol &mol, bool includeStereo = true,
129  int confId = -1, bool kekulize = true) {
130  return MolToMolBlock(mol, includeStereo, confId, kekulize, true);  // trailing `true` is the forceV3000 argument
131 }
132 
133 // \brief Writes a molecule to an MDL mol file
134 /*!
135  * \param mol - the molecule in question
136  * \param fName - the name of the file to use
137  * \param includeStereo - toggles inclusion of stereochemistry information
138  * \param confId - selects the conformer to be used
139  * \param kekulize - triggers kekulization of the molecule before it is
140  * written
141  * \param forceV3000 - force generation of a V3000 mol block (happens
142  * automatically with
143  * more than 999 atoms or bonds)
144  */
146  const ROMol &mol, const std::string &fName, bool includeStereo = true,
147  int confId = -1, bool kekulize = true, bool forceV3000 = false);
148 
149 // \brief Writes a molecule to an MDL V3000 mol file (thin wrapper around MolToMolFile())
150 /*!
151  * \param mol - the molecule in question
152  * \param fName - the name of the file to use
153  * \param includeStereo - toggles inclusion of stereochemistry information
154  * \param confId - selects the conformer to be used
155  * \param kekulize - triggers kekulization of the molecule before it is
156  * written
157  */
158 inline void MolToV3KMolFile(const ROMol &mol, const std::string &fName,
159  bool includeStereo = true, int confId = -1,
160  bool kekulize = true) {
161  MolToMolFile(mol, fName, includeStereo, confId, kekulize, true);  // trailing `true` is the forceV3000 argument
162 }
163 
165  int confId = -1,
166  bool kekulize = true);
167 
169  const std::string &fName,
170  int confId = -1,
171  bool kekulize = true);
172 
174  int confId = -1);
175 
177  const std::string &fName,
178  int confId = -1);
179 
180 //-----
181 // TPL handling:
182 //-----
183 
184 //! \brief translate TPL data (BioCad format) into a multi-conf molecule
185 /*!
186  \param inStream: the stream from which to read
187  \param line: used to track the line number of errors
188  \param sanitize: toggles sanitization and stereochemistry
189  perception of the molecule
190  \param skipFirstConf: according to the TPL format description, the atomic
191  coords in the atom-information block describe the first
192  conformation and the first conf block describes the second
193  conformation. The CombiCode, on the other hand, writes
194  the first conformation data both to the atom-information
195  block and to the first conf block. We want to be able to
196  read CombiCode-style tpls, so we'll allow this
197  mis-feature
198  to be parsed when this flag is set.
199 */
201  unsigned int &line,
202  bool sanitize = true,
203  bool skipFirstConf = false);
204 
205 //! \brief construct a multi-conf molecule from a TPL (BioCad format) file
206 /*!
207  \param fName: the name of the file from which to read
208  \param sanitize: toggles sanitization and stereochemistry
209  perception of the molecule
210  \param skipFirstConf: according to the TPL format description, the atomic
211  coords in the atom-information block describe the first
212  conformation and the first conf block describes the second
213  conformation. The CombiCode, on the other hand, writes
214  the first conformation data both to the atom-information
215  block and to the first conf block. We want to be able to
216  read CombiCode-style tpls, so we'll allow this
217  mis-feature
218  to be parsed when this flag is set.
219 */
220 RDKIT_FILEPARSERS_EXPORT RWMol *TPLFileToMol(const std::string &fName,
221  bool sanitize = true,
222  bool skipFirstConf = false);
223 
225  const ROMol &mol, const std::string &partialChargeProp = "_GasteigerCharge",
226  bool writeFirstConfTwice = false);
228  const ROMol &mol, const std::string &fName,
229  const std::string &partialChargeProp = "_GasteigerCharge",
230  bool writeFirstConfTwice = false);
231 
232 //-----
233 // MOL2 handling
234 //-----
235 
236 typedef enum {
237  CORINA = 0 //!< supports output from Corina and some dbtranslate output
239 
240 // \brief construct a molecule from a Tripos mol2 file
241 /*!
242  *
243  * \param fName - string containing the file name
244  * \param sanitize - toggles sanitization of the molecule
245  * \param removeHs - toggles removal of Hs from the molecule. H removal
246  * is only done if the molecule is sanitized
247  * \param variant - the atom type definitions to use
248  * \param cleanupSubstructures - toggles recognition and cleanup of common
249  * substructures
250  */
251 RDKIT_FILEPARSERS_EXPORT RWMol *Mol2FileToMol(const std::string &fName,
252  bool sanitize = true,
253  bool removeHs = true,
254  Mol2Type variant = CORINA,
255  bool cleanupSubstructures = true);
256 
257 // \brief construct a molecule from Tripos mol2 data in a stream
258 /*!
259  * \param inStream - stream containing the data
260  * \param sanitize - toggles sanitization of the molecule
261  * \param removeHs - toggles removal of Hs from the molecule. H removal
262  * is only done if the molecule is sanitized
263  * \param variant - the atom type definitions to use
264  * \param cleanupSubstructures - toggles recognition and cleanup of common
265  * substructures
266  */
268  std::istream *inStream, bool sanitize = true, bool removeHs = true,
269  Mol2Type variant = CORINA, bool cleanupSubstructures = true);
270 // \overload
272  std::istream &inStream, bool sanitize = true, bool removeHs = true,
273  Mol2Type variant = CORINA, bool cleanupSubstructures = true);
274 
275 // \brief construct a molecule from a Tripos mol2 block
276 /*!
277  * \param molBlock - string containing the mol block
278  * \param sanitize - toggles sanitization of the molecule
279  * \param removeHs - toggles removal of Hs from the molecule. H removal
280  * is only done if the molecule is sanitized
281  * \param variant - the atom type definitions to use
282  * \param cleanupSubstructures - toggles recognition and cleanup of common
283  * substructures
284  */
286  const std::string &molBlock, bool sanitize = true, bool removeHs = true,
287  Mol2Type variant = CORINA, bool cleanupSubstructures = true);
288 
290 // \brief construct a molecule from an xyz block
291 /*!
292  * \param xyzBlock - string containing the xyz block
293  */
294 RDKIT_FILEPARSERS_EXPORT RWMol *XYZBlockToMol(const std::string &xyzBlock);
295 // \brief construct a molecule from an xyz file
296 /*!
297  * \param fName - string containing the file name
298  */
299 RDKIT_FILEPARSERS_EXPORT RWMol *XYZFileToMol(const std::string &fName);
300 
302  bool sanitize = true,
303  bool removeHs = true,
304  unsigned int flavor = 0,
305  bool proximityBonding = true);
306 
308  bool sanitize = true,
309  bool removeHs = true,
310  unsigned int flavor = 0,
311  bool proximityBonding = true);
313  std::istream *inStream, bool sanitize = true, bool removeHs = true,
314  unsigned int flavor = 0, bool proximityBonding = true);
316  std::istream &inStream, bool sanitize = true, bool removeHs = true,
317  unsigned int flavor = 0, bool proximityBonding = true);
318 RDKIT_FILEPARSERS_EXPORT RWMol *PDBFileToMol(const std::string &fname,
319  bool sanitize = true,
320  bool removeHs = true,
321  unsigned int flavor = 0,
322  bool proximityBonding = true);
323 
324 // \brief generates a PDB block for a molecule
325 /*!
326  * \param mol - the molecule in question
327  * \param confId - selects the conformer to be used
328  * \param flavor - controls what gets written:
329  * flavor & 1 : Write MODEL/ENDMDL lines around each record
330  * flavor & 2 : Don't write single CONECT records
331  * flavor & 4 : Write CONECT records in both directions
332  * flavor & 8 : Don't use multiple CONECTs to encode bond order
333  * flavor & 16 : Write MASTER record
334  * flavor & 32 : Write TER record
335  */
337  int confId = -1,
338  unsigned int flavor = 0);
339 // \brief Writes a molecule to a PDB file
340 /*!
341  * \param mol - the molecule in question
342  * \param fName - the name of the file to use
343  * \param confId - selects the conformer to be used
344  * \param flavor - controls what gets written:
345  * flavor & 1 : Write MODEL/ENDMDL lines around each record
346  * flavor & 2 : Don't write single CONECT records
347  * flavor & 4 : Write CONECT records in both directions
348  * flavor & 8 : Don't use multiple CONECTs to encode bond order
349  * flavor & 16 : Write MASTER record
350  * flavor & 32 : Write TER record
351  */
353  const std::string &fname,
354  int confId = -1,
355  unsigned int flavor = 0);
356 
357 // \brief reads a molecule from the metadata in an RDKit-generated SVG file
358 /*!
359  * \param svg - string containing the SVG
360  * \param sanitize - toggles sanitization of the molecule
361  * \param removeHs - toggles removal of Hs from the molecule. H removal
362  * is only done if the molecule is sanitized
363  *
364  * **NOTE** This functionality should be considered beta.
365  */
367  bool sanitize = true,
368  bool removeHs = true);
369 /*! \overload
370  */
372  bool sanitize = true,
373  bool removeHs = true);
374 
375 inline std::unique_ptr<RDKit::RWMol> operator"" _ctab(const char *text,  // user-defined literal: builds a molecule from mol-block text
376  size_t len) {
377  std::string data(text, len);  // copy exactly len characters of the literal
378  RWMol *ptr = nullptr;
379  try {
380  ptr = MolBlockToMol(data);  // called with the default sanitize/removeHs/strictParsing arguments
381  } catch (const RDKit::MolSanitizeException &) {  // only sanitization errors are caught; anything else propagates
382  ptr = nullptr;  // ptr was never assigned if the call threw; this just makes the null result explicit
383  }
384  return std::unique_ptr<RWMol>(ptr);  // caller owns the molecule; empty on sanitization failure
385 }
386 inline std::unique_ptr<RDKit::RWMol> operator"" _mol2(const char *text,  // user-defined literal: builds a molecule from mol2-block text
387  size_t len) {
388  std::string data(text, len);  // copy exactly len characters of the literal
389  RWMol *ptr = nullptr;
390  try {
391  ptr = Mol2BlockToMol(data);  // called with the default sanitize/removeHs/variant/cleanupSubstructures arguments
392  } catch (const RDKit::MolSanitizeException &) {  // only sanitization errors are caught; anything else propagates
393  ptr = nullptr;  // ptr was never assigned if the call threw; this just makes the null result explicit
394  }
395  return std::unique_ptr<RWMol>(ptr);  // caller owns the molecule; empty on sanitization failure
396 }
397 
398 inline std::unique_ptr<RDKit::RWMol> operator"" _pdb(const char *text,  // user-defined literal: builds a molecule from PDB-block text
399  size_t len) {
400  std::string data(text, len);  // copy exactly len characters of the literal
401  RWMol *ptr = nullptr;
402  try {
403  ptr = PDBBlockToMol(data);  // called with the default sanitize/removeHs/flavor/proximityBonding arguments
404  } catch (const RDKit::MolSanitizeException &) {  // only sanitization errors are caught; anything else propagates
405  ptr = nullptr;  // ptr was never assigned if the call threw; this just makes the null result explicit
406  }
407  return std::unique_ptr<RWMol>(ptr);  // caller owns the molecule; empty on sanitization failure
408 }
409 
410 } // namespace RDKit
411 
412 #endif
pulls in the core RDKit functionality
MolFileUnhandledFeatureException(const char *msg)
construct with an error message
Definition: FileParsers.h:33
MolFileUnhandledFeatureException(const std::string msg)
construct with an error message
Definition: FileParsers.h:35
~MolFileUnhandledFeatureException() noexcept override=default
const char * what() const noexcept override
get the error message
Definition: FileParsers.h:38
class for flagging sanitization errors
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:32
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:161
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
Std stuff.
Definition: Abbreviations.h:19
std::string MolToV3KMolBlock(const ROMol &mol, bool includeStereo=true, int confId=-1, bool kekulize=true)
Definition: FileParsers.h:128
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
RDKIT_FILEPARSERS_EXPORT void MolToMolFile(const ROMol &mol, const std::string &fName, bool includeStereo=true, int confId=-1, bool kekulize=true, bool forceV3000=false)
RDKIT_FILEPARSERS_EXPORT std::string MolToPDBBlock(const ROMol &mol, int confId=-1, unsigned int flavor=0)
RDKIT_FILEPARSERS_EXPORT RWMol * XYZFileToMol(const std::string &fName)
RDKIT_FILEPARSERS_EXPORT RWMol * MolBlockToMol(const std::string &molBlock, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
RDKIT_FILEPARSERS_EXPORT std::string MolToXYZBlock(const ROMol &mol, int confId=-1)
RDKIT_FILEPARSERS_EXPORT void MolToXYZFile(const ROMol &mol, const std::string &fName, int confId=-1)
RDKIT_FILEPARSERS_EXPORT std::string MolToTPLText(const ROMol &mol, const std::string &partialChargeProp="_GasteigerCharge", bool writeFirstConfTwice=false)
RDKIT_FILEPARSERS_EXPORT void MolToPDBFile(const ROMol &mol, const std::string &fname, int confId=-1, unsigned int flavor=0)
RDKIT_FILEPARSERS_EXPORT void MolToCMLFile(const ROMol &mol, const std::string &fName, int confId=-1, bool kekulize=true)
RDKIT_FILEPARSERS_EXPORT RWMol * PDBFileToMol(const std::string &fname, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
void MolToV3KMolFile(const ROMol &mol, const std::string &fName, bool includeStereo=true, int confId=-1, bool kekulize=true)
Definition: FileParsers.h:158
RDKIT_FILEPARSERS_EXPORT RWMol * XYZBlockToMol(const std::string &xyzBlock)
RDKIT_FILEPARSERS_EXPORT RWMol * TPLDataStreamToMol(std::istream *inStream, unsigned int &line, bool sanitize=true, bool skipFirstConf=false)
translate TPL data (BioCad format) into a multi-conf molecule
RDKIT_FILEPARSERS_EXPORT std::string MolToMolBlock(const ROMol &mol, bool includeStereo=true, int confId=-1, bool kekulize=true, bool forceV3000=false)
RDKIT_FILEPARSERS_EXPORT std::string MolToCMLBlock(const ROMol &mol, int confId=-1, bool kekulize=true)
RDKIT_FILEPARSERS_EXPORT RWMol * PDBDataStreamToMol(std::istream *inStream, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
@ CORINA
supports output from Corina and some dbtranslate output
Definition: FileParsers.h:237
RDKIT_FILEPARSERS_EXPORT RWMol * Mol2FileToMol(const std::string &fName, bool sanitize=true, bool removeHs=true, Mol2Type variant=CORINA, bool cleanupSubstructures=true)
RDKIT_FILEPARSERS_EXPORT RWMol * RDKitSVGToMol(const std::string &svg, bool sanitize=true, bool removeHs=true)
RDKIT_FILEPARSERS_EXPORT void MolToTPLFile(const ROMol &mol, const std::string &fName, const std::string &partialChargeProp="_GasteigerCharge", bool writeFirstConfTwice=false)
RDKIT_FILEPARSERS_EXPORT RWMol * Mol2DataStreamToMol(std::istream *inStream, bool sanitize=true, bool removeHs=true, Mol2Type variant=CORINA, bool cleanupSubstructures=true)
RDKIT_FILEPARSERS_EXPORT RWMol * PDBBlockToMol(const char *str, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
RDKIT_FILEPARSERS_EXPORT RWMol * Mol2BlockToMol(const std::string &molBlock, bool sanitize=true, bool removeHs=true, Mol2Type variant=CORINA, bool cleanupSubstructures=true)
RDKIT_FILEPARSERS_EXPORT RWMol * TPLFileToMol(const std::string &fName, bool sanitize=true, bool skipFirstConf=false)
construct a multi-conf molecule from a TPL (BioCad format) file
RDKIT_FILEPARSERS_EXPORT RWMol * MolDataStreamToMol(std::istream *inStream, unsigned int &line, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
RDKIT_FILEPARSERS_EXPORT RWMol * XYZDataStreamToMol(std::istream &inStream)
RDKIT_FILEPARSERS_EXPORT RWMol * MolFileToMol(const std::string &fName, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
const int MOLFILE_MAXLINE
Definition: FileParsers.h:26
std::vector< RWMOL_SPTR > RWMOL_SPTR_VECT
Definition: FileParsers.h:48
boost::shared_ptr< RWMol > RWMOL_SPTR
Definition: RWMol.h:217