RDKit
Open-source cheminformatics and machine learning.
FileParserUtils.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2010-2019 Greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_FILEPARSERUTILS_H
12 #define RD_FILEPARSERUTILS_H
13 
14 #include <string>
15 #include <iostream>
17 #include <boost/lexical_cast.hpp>
18 #include <boost/algorithm/string.hpp>
19 #include <boost/format.hpp>
21 
22 namespace RDKit {
23 class RWMol;
24 class Conformer;
25 
26 namespace FileParserUtils {
27 template <typename T>
28 T stripSpacesAndCast(const std::string &input, bool acceptSpaces = false) {
29  std::string trimmed = boost::trim_copy(input);
30  if (acceptSpaces && trimmed == "") {
31  return 0;
32  } else {
33  return boost::lexical_cast<T>(trimmed);
34  }
35 }
36 RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input,
37  bool acceptSpaces = false);
38 RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input,
39  bool acceptSpaces = true);
40 
41 // reads a line from an MDL v3K CTAB
42 RDKIT_FILEPARSERS_EXPORT std::string getV3000Line(std::istream *inStream,
43  unsigned int &line);
44 
45 // nAtoms and nBonds are ignored on input, set on output
47  std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
48  bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
49  bool strictParsing = true, bool expectMEND = true);
50 
51 // nAtoms and nBonds are used
53  std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
54  bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
55  bool strictParsing = true);
56 
57 //! finishes up the processing (sanitization, etc.) of a molecule read from CTAB
58 RDKIT_FILEPARSERS_EXPORT void finishMolProcessing(RWMol *res, bool chiralityPossible, bool sanitize,
59  bool removeHs);
60 
62 
63 //! applies a particular property to the atoms as an atom property list
64 template <typename T>
65 void applyMolListPropToAtoms(ROMol &mol, const std::string &pn,
66  const std::string &prefix,
67  const std::string &missingValueMarker = "n/a") {
68  std::string atompn = pn.substr(prefix.size());
69  std::string strVect = mol.getProp<std::string>(pn);
70  std::vector<std::string> tokens;
71  boost::split(tokens, strVect, boost::is_any_of(" \t\n"),
72  boost::token_compress_on);
73  if (tokens.size() < mol.getNumAtoms()) {
75  << "Property list " << pn << " too short, only " << tokens.size()
76  << " elements found. Ignoring it." << std::endl;
77  return;
78  }
79  std::string mv = missingValueMarker;
80  size_t first_token = 0;
81  if (tokens.size() == mol.getNumAtoms() + 1 && tokens[0].front() == '[' &&
82  tokens[0].back() == ']') {
83  mv = std::string(tokens[0].begin() + 1, tokens[0].end() - 1);
84  first_token = 1;
85  }
86  if (mv.empty()) {
87  BOOST_LOG(rdWarningLog) << "Missing value marker for property " << pn
88  << " is empty." << std::endl;
89  }
90  for (size_t i = first_token; i < tokens.size(); ++i) {
91  if (tokens[i] != mv) {
92  unsigned int atomid = i - first_token;
93  try {
94  T apv = boost::lexical_cast<T>(tokens[i]);
95  mol.getAtomWithIdx(atomid)->setProp(atompn, apv);
96  } catch (const boost::bad_lexical_cast &) {
98  << "Value " << tokens[i] << " for property " << pn << " of atom "
99  << atomid << " can not be parsed. Ignoring it." << std::endl;
100  }
101  }
102  }
103 }
104 
105 //! applies all properties matching a particular prefix as an atom property list
106 template <typename T>
107 void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix,
108  const std::string missingValueMarker = "n/a") {
109  for (auto pn : mol.getPropList()) {
110  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
111  applyMolListPropToAtoms<T>(mol, pn, prefix, missingValueMarker);
112  }
113  }
114 }
//! leading part shared by the names of all molecule properties that are
//! treated as atom property lists
static const std::string atomPropPrefix{"atom."};
116 //! if the property name matches our rules for atom property lists, we'll apply
117 //! it to the atoms
119  ROMol &mol, const std::string pn,
120  const std::string &missingValueMarker = "n/a") {
121  if (pn.find(atomPropPrefix) == 0 && pn.length() > atomPropPrefix.length()) {
122  std::string prefix = atomPropPrefix + "prop.";
123  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
124  applyMolListPropToAtoms<std::string>(mol, pn, prefix, missingValueMarker);
125  } else {
126  prefix = atomPropPrefix + "iprop.";
127  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
128  applyMolListPropToAtoms<std::int64_t>(mol, pn, prefix,
129  missingValueMarker);
130  } else {
131  prefix = atomPropPrefix + "dprop.";
132  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
133  applyMolListPropToAtoms<double>(mol, pn, prefix, missingValueMarker);
134  } else {
135  prefix = atomPropPrefix + "bprop.";
136  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
137  applyMolListPropToAtoms<bool>(mol, pn, prefix, missingValueMarker);
138  }
139  }
140  }
141  }
142  }
143 }
144 //! loops over all properties and applies the ones that match the rules for atom
145 //! property lists to the atoms
147  ROMol &mol, const std::string &missingValueMarker = "n/a") {
148  for (auto pn : mol.getPropList()) {
149  processMolPropertyList(mol, pn, missingValueMarker);
150  }
151 }
152 template <typename T>
153 std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName,
154  std::string missingValueMarker = "",
155  unsigned int lineSize = 190) {
156  std::string res;
157  std::string propVal;
158  if (!missingValueMarker.empty()) {
159  propVal += boost::str(boost::format("[%s] ") % missingValueMarker);
160  } else {
161  missingValueMarker = "n/a";
162  }
163  for (const auto &atom : mol.atoms()) {
164  std::string apVal = missingValueMarker;
165  if (atom->hasProp(atomPropName)) {
166  T tVal = atom->getProp<T>(atomPropName);
167  apVal = boost::lexical_cast<std::string>(tVal);
168  // seems like this should work, but it doesn't:
169  // atom->getProp(atomPropName,apVal);
170  }
171  if (propVal.length() + apVal.length() + 1 >= lineSize) {
172  // remove trailing space:
173  propVal.pop_back();
174  res += propVal + "\n";
175  propVal = "";
176  }
177  propVal += apVal + " ";
178  }
179  if (!propVal.empty()) {
180  // remove the trailing space:
181  propVal.pop_back();
182  res += propVal;
183  }
184  return res;
185 }
187  ROMol &mol, const std::string &atomPropName,
188  const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
189  std::string molPropName = "atom.iprop." + atomPropName;
190  mol.setProp(molPropName,
191  getAtomPropertyList<boost::int64_t>(
192  mol, atomPropName, missingValueMarker, lineSize));
193 }
195  ROMol &mol, const std::string &atomPropName,
196  const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
197  std::string molPropName = "atom.dprop." + atomPropName;
198  mol.setProp(molPropName,
199  getAtomPropertyList<double>(mol, atomPropName, missingValueMarker,
200  lineSize));
201 }
203  ROMol &mol, const std::string &atomPropName,
204  const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
205  std::string molPropName = "atom.bprop." + atomPropName;
206  mol.setProp(molPropName,
207  getAtomPropertyList<bool>(mol, atomPropName, missingValueMarker,
208  lineSize));
209 }
211  ROMol &mol, const std::string &atomPropName,
212  const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
213  std::string molPropName = "atom.prop." + atomPropName;
214  mol.setProp(molPropName,
215  getAtomPropertyList<std::string>(mol, atomPropName,
216  missingValueMarker, lineSize));
217 }
218 
219 } // namespace FileParserUtils
220 } // namespace RDKit
221 
222 #endif
BOOST_LOG
#define BOOST_LOG(__arg__)
Definition: RDLog.h:88
RDKit::FileParserUtils::applyMolListPropsToAtoms
void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix, const std::string missingValueMarker="n/a")
applies all properties matching a particular prefix as an atom property list
Definition: FileParserUtils.h:107
RDKit::RDProps::getProp
void getProp(const std::string &key, T &res) const
allows retrieval of a particular property value
Definition: RDProps.h:99
BoostStartInclude.h
RDKit::RDProps::getPropList
STR_VECT getPropList(bool includePrivate=true, bool includeComputed=true) const
returns a list with the names of our properties
Definition: RDProps.h:36
RDKit::FileParserUtils::getAtomPropertyList
std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName, std::string missingValueMarker="", unsigned int lineSize=190)
Definition: FileParserUtils.h:153
RDKit::FileParserUtils::processMolPropertyList
void processMolPropertyList(ROMol &mol, const std::string pn, const std::string &missingValueMarker="n/a")
Definition: FileParserUtils.h:118
RDKIT_FILEPARSERS_EXPORT
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:216
RDKit::RWMol
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:31
RDKit::RDProps::setProp
void setProp(const std::string &key, T val, bool computed=false) const
sets a property value
Definition: RDProps.h:68
RDKit::FileParserUtils::ParseV3000CTAB
RDKIT_FILEPARSERS_EXPORT bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true, bool expectMEND=true)
RDKit::Atom
The class for representing atoms.
Definition: Atom.h:69
BoostEndInclude.h
RDKit::ROMol::getAtomWithIdx
Atom * getAtomWithIdx(unsigned int idx)
returns a pointer to a particular Atom
RDKit::ROMol
Definition: ROMol.h:171
RDKit::FileParserUtils::replaceAtomWithQueryAtom
RDKIT_FILEPARSERS_EXPORT Atom * replaceAtomWithQueryAtom(RWMol *mol, Atom *atom)
RDKit::FileParserUtils::createAtomIntPropertyList
void createAtomIntPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
Definition: FileParserUtils.h:186
RDKit::FileParserUtils::toInt
RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input, bool acceptSpaces=false)
RDKit::FileParserUtils::atomPropPrefix
static const std::string atomPropPrefix
Definition: FileParserUtils.h:115
RDKit::FileParserUtils::getV3000Line
RDKIT_FILEPARSERS_EXPORT std::string getV3000Line(std::istream *inStream, unsigned int &line)
RDKit::Conformer
The class for representing 2D or 3D conformation of a molecule.
Definition: Conformer.h:43
RDKit::FileParserUtils::applyMolListPropToAtoms
void applyMolListPropToAtoms(ROMol &mol, const std::string &pn, const std::string &prefix, const std::string &missingValueMarker="n/a")
applies a particular property to the atoms as an atom property list
Definition: FileParserUtils.h:65
RDKit::FileParserUtils::finishMolProcessing
RDKIT_FILEPARSERS_EXPORT void finishMolProcessing(RWMol *res, bool chiralityPossible, bool sanitize, bool removeHs)
finishes up the processing (sanitization, etc.) of a molecule read from CTAB
RDKit::FileParserUtils::processMolPropertyLists
void processMolPropertyLists(ROMol &mol, const std::string &missingValueMarker="n/a")
Definition: FileParserUtils.h:146
RDKit::ROMol::getNumAtoms
unsigned int getNumAtoms(bool onlyExplicit=1) const
returns our number of atoms
RDKit
Std stuff.
Definition: Atom.h:30
RDKit::FileParserUtils::createAtomBoolPropertyList
void createAtomBoolPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
Definition: FileParserUtils.h:202
RDKit::MolOps::removeHs
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
RDKit::ROMol::atoms
CXXAtomIterator< MolGraph, Atom * > atoms()
C++11 Range iterator.
Definition: ROMol.h:249
RDKit::FileParserUtils::createAtomDoublePropertyList
void createAtomDoublePropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
Definition: FileParserUtils.h:194
rdWarningLog
RDKIT_RDGENERAL_EXPORT std::shared_ptr< boost::logging::rdLogger > rdWarningLog
RDKit::FileParserUtils::toDouble
RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input, bool acceptSpaces=true)
RDKit::FileParserUtils::ParseV2000CTAB
RDKIT_FILEPARSERS_EXPORT bool ParseV2000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true)
RDKit::FileParserUtils::createAtomStringPropertyList
void createAtomStringPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
Definition: FileParserUtils.h:210
RDKit::FileParserUtils::stripSpacesAndCast
T stripSpacesAndCast(const std::string &input, bool acceptSpaces=false)
Definition: FileParserUtils.h:28
export.h