RDKit
Open-source cheminformatics and machine learning.
StructChecker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2016 Novartis Institutes for BioMedical Research
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 /*! \file StructChecker.h
12 
13 \brief Contains the public API of the StructChecker
14 
15 \b Note that this should be considered beta and that the API may change in
16 future
17 releases.
18 
19 */
20 #include <RDGeneral/export.h>
21 #pragma once
22 #ifndef RD_STRUCTCHECKER_H_Oct2016
23 #define RD_STRUCTCHECKER_H_Oct2016
24 
25 #include <string>
26 #include <vector>
27 #include "../RDKitBase.h"
28 
29 namespace RDKit {
30 namespace StructureCheck {
31 
32 // Flags for the return values of the StructureChecker
33 
34 // TypeDefs for translating augmented atom pairs
35 static const int ANY_CHARGE = 8;
37  RT_NONE = 0,
38  SINGLET = 1,
39  DOUBLET = 2,
40  TRIPLET = 3,
41  ANY_RADICAL = 0xFF
42 };
43 
44 enum AABondType { // MDL CTFile bond types plus extensions
45  BT_NONE = 0, // means REMOVE Bond
46  SINGLE = 1,
47  DOUBLE = 2,
48  TRIPLE = 3,
49  AROMATIC = 4,
53  ANY_BOND = 8,
55 };
56 
57 enum AATopology { // topology values; AugmentedAtom's default ctor uses TP_NONE
58  TP_NONE = 0, // Don't care
59  RING = 1, // Ring
60  CHAIN = 2 // Chain
61 };
62 
64  std::string AtomSymbol;
65  int Charge;
67  unsigned SubstitutionCount; // substitution count 0 = don't care
70  : Charge(ANY_CHARGE),
71  Radical(ANY_RADICAL),
72  SubstitutionCount(0),
73  BondType(ANY_BOND) {}
74 };
75 
77  std::string AtomSymbol;
78  std::string ShortName;
79  int Charge;
82  std::vector<Ligand> Ligands;
83 
85  : Charge(ANY_CHARGE), Radical(ANY_RADICAL), Topology(TP_NONE) {}
86 
87  AugmentedAtom(const std::string &symbol, const std::string &name, int charge,
88  RadicalType radical, AATopology topology)
89  : AtomSymbol(symbol),
90  ShortName(name),
91  Charge(charge),
92  Radical(radical),
93  Topology(topology) {}
94 };
95 
97  std::string AtomSymbol;
98  double LocalInc;
99  double AlphaInc;
100  double BetaInc;
101  double MultInc;
102 
103  // Used for logging
108 };
109 
112  double Cond;
113  // Used for logging
115 };
116 //-------------
117 
118 //! Structure Check Options
119 // Holds all the user options for the StructureChecking.
120 // Can be initialized from factory functions, perhaps serialized
122  double AcidityLimit;
127  unsigned MaxMolSize;
134  bool Verbose;
135 
136  // Internal data for struchk
137  std::vector<std::pair<AugmentedAtom, AugmentedAtom>> AugmentedAtomPairs;
138  std::vector<AugmentedAtom> AcidicAtoms;
139  std::vector<AugmentedAtom> GoodAtoms;
140  std::vector<ROMOL_SPTR> Patterns;
141  std::vector<ROMOL_SPTR> RotatePatterns;
142  std::vector<ROMOL_SPTR> StereoPatterns;
143  std::vector<ROMOL_SPTR> FromTautomer;
144  std::vector<ROMOL_SPTR> ToTautomer;
145 
146  double Elneg0; // elneg_table[0].value;
147  std::map<unsigned, double> ElnegTable; // AtomicNumber -> elneg
148  std::vector<IncEntry> AtomAcidity; // atom_acidity_table[]
149  std::vector<IncEntry> ChargeIncTable;
150  // std::map AtomSymbol(or AtomicNumber) -> IncEntry
151  /* [ReadTransformation() ]
152  * The alpha, beta coefficients of the transformation function used
153  * to stretch the preliminary pKa values to the actual predictions.
154  * The function is pKa = 7 + (pKa'-7)*beta + ((pKa'-7)*alpha)^3.
155  */
156 
157  double Alpha, Beta;
158  std::vector<PathEntry> AlphaPathTable, BetaPathTable;
159 
160  public:
162 
163  void clear() { *this = StructCheckerOptions(); }
164 
165  bool loadAugmentedAtomTranslations(const std::string &path);
166  void setAugmentedAtomTranslations(
167  const std::vector<std::pair<AugmentedAtom, AugmentedAtom>> &aaPairs);
168 
169  bool loadAcidicAugmentedAtoms(const std::string &path);
170  void setAcidicAugmentedAtoms(const std::vector<AugmentedAtom> &acidicAtoms);
171 
172  bool loadGoodAugmentedAtoms(const std::string &path);
173  void setGoodAugmentedAtoms(const std::vector<AugmentedAtom> &acidicAtoms);
174 
175  bool loadPatterns(const std::string &path); // file with clean patterns
176  void parsePatterns(
177  const std::vector<std::string> &smarts); // can throw RDKit exceptions
178  void setPatterns(const std::vector<ROMOL_SPTR> &p);
179 
180  bool loadRotatePatterns(
181  const std::string &path); // file with rotate patterns
182  void parseRotatePatterns(
183  const std::vector<std::string> &smarts); // can throw RDKit exceptions
184  void setRotatePatterns(const std::vector<ROMOL_SPTR> &p);
185 
186  bool loadStereoPatterns(
187  const std::string &path); // file with stereo patterns
188  void parseStereoPatterns(
189  const std::vector<std::string> &smarts); // can throw RDKit exceptions
190  void setStereoPatterns(const std::vector<ROMOL_SPTR> &p);
191 
192  bool loadTautomerData(const std::string &path); // file path
193  void parseTautomerData(const std::vector<std::string> &smartsFrom,
194  const std::vector<std::string> &smartsTo);
195  void setTautomerData(const std::vector<ROMOL_SPTR> &from,
196  const std::vector<ROMOL_SPTR> &to);
197  bool loadChargeDataTables(const std::string &path); // file path
198 };
199 
200 RDKIT_STRUCTCHECKER_EXPORT bool parseOptionsJSON(const std::string &json,
201  StructCheckerOptions &op);
202 
204  StructCheckerOptions &op,
205  const std::string &augmentedAtomTranslationsFile = "",
206  // ?? AcidicAtoms;
207  // ?? GoodAtoms;
208  const std::string &patternFile = "", // file with clean patterns
209  const std::string &rotatePatternFile = "", // file with rotate patterns
210  const std::string &stereoPatternFile = "", // file with stereo patterns
211  const std::string &tautomerFile = "");
212 
213 //! \brief Class for performing structure validation and cleanup
214 /*! \b NOTE: This class should be considered beta. The API may change in future
215 releases.
216 
217 Examples of Usage
218 
219 \code
220  StructChecker chk;
221  unsigned flags = chk.checkMolStructure( mol ); // use defaults
222 \endcode
223 
224 or
225 
226 \code
227  StructureCheck::StructCheckerOptions options; // use defaults
228  // To use external data
229  StructureCheck::loadOptionsFromFiles(options, file1, file2);
230  StructChecker chk(options);
231 
232  for( mol in mols ) {
233  unsigned flags = chk.checkMolStructure( mol );
234  if (0!=(flags & StructureCheck::StructChecker::BAD_SET)) {
235  // write to error file
236  } else if (0!=(flags & StructureCheck::StructChecker::TRANSFORMED_SET))
237 {
238  // input molecule was transformed
239  } else { // flags == NO_CHANGE
240  // no change
241  }
242  }
243 \endcode
244 */
246  public:
247  typedef enum StructureFlags {
248  NO_CHANGE = 0,
249  BAD_MOLECULE = 0x0001,
250  ALIAS_CONVERSION_FAILED = 0x0002,
251  STEREO_ERROR = 0x0004,
252  STEREO_FORCED_BAD = 0x0008,
253  ATOM_CLASH = 0x0010,
254  ATOM_CHECK_FAILED = 0x0020,
255  SIZE_CHECK_FAILED = 0x0040,
256  // reserved error = 0x0080,
257  TRANSFORMED = 0x0100,
258  FRAGMENTS_FOUND = 0x0200,
259  EITHER_WARNING = 0x0400,
260  DUBIOUS_STEREO_REMOVED = 0x0800,
261  RECHARGED = 0x1000,
262  STEREO_TRANSFORMED = 0x2000,
263  TEMPLATE_TRANSFORMED = 0x4000,
264  TAUTOMER_TRANSFORMED = 0x8000,
265  // mask:
266  BAD_SET = (BAD_MOLECULE | ALIAS_CONVERSION_FAILED | STEREO_ERROR |
267  STEREO_FORCED_BAD | ATOM_CLASH | ATOM_CHECK_FAILED |
268  SIZE_CHECK_FAILED),
269 
270  TRANSFORMED_SET = (TRANSFORMED | FRAGMENTS_FOUND | EITHER_WARNING |
271  DUBIOUS_STEREO_REMOVED | STEREO_TRANSFORMED |
272  TEMPLATE_TRANSFORMED | TAUTOMER_TRANSFORMED | RECHARGED),
273  } StructureFlags;
274  // attributes:
275  private:
276  StructCheckerOptions Options;
277 
278  public:
279  inline StructChecker() {}
280  inline StructChecker(const StructCheckerOptions &options)
281  : Options(options) {}
282 
283  const StructCheckerOptions &GetOptions() const { return Options; }
284  void SetOptions(const StructCheckerOptions &options) { Options = options; }
285 
286 // Check and fix (if needed) the molecule structure and return a set of
287 // StructureFlags
288 // that describes what has been done
289  unsigned checkMolStructure(RWMol &mol) const;
290 
291 // instance-independent helper methods:
292 // Converts structure property flags to a comma-separated string
293  static std::string StructureFlagsToString(unsigned flags);
294 // Converts a comma-separated string to a StructureFlags unsigned integer
295  static unsigned StringToStructureFlags(const std::string &str);
296  // internal implementation:
297  private:
298 };
299 } // namespace StructureCheck
300 } // namespace RDKit
301 #endif
RDKit::StructureCheck::Ligand::Ligand
Ligand()
Definition: StructChecker.h:69
RDKit::StructureCheck::TP_NONE
@ TP_NONE
Definition: StructChecker.h:58
RDKit::StructureCheck::IncEntry::beta_inc_used
int beta_inc_used
Definition: StructChecker.h:106
RDKit::StructureCheck::StructCheckerOptions::Beta
double Beta
Definition: StructChecker.h:157
RDKit::StructureCheck::StructCheckerOptions::AugmentedAtomPairs
std::vector< std::pair< AugmentedAtom, AugmentedAtom > > AugmentedAtomPairs
Definition: StructChecker.h:137
RDKit::StructureCheck::AugmentedAtom::ShortName
std::string ShortName
Definition: StructChecker.h:78
RDKit::StructureCheck::AugmentedAtom::Topology
AATopology Topology
Definition: StructChecker.h:81
RDKit::StructureCheck::StructChecker::StructureFlags
StructureFlags
Definition: StructChecker.h:247
RDKit::StructureCheck::Ligand::Charge
int Charge
Definition: StructChecker.h:65
RDKit::StructureCheck::StructCheckerOptions::Elneg0
double Elneg0
Definition: StructChecker.h:146
RDKit::StructureCheck::StructCheckerOptions::ConvertAtomTexts
bool ConvertAtomTexts
Definition: StructChecker.h:132
RDKit::StructureCheck::StructCheckerOptions::FromTautomer
std::vector< ROMOL_SPTR > FromTautomer
Definition: StructChecker.h:143
RDKit::StructureCheck::StructCheckerOptions::ChargeIncTable
std::vector< IncEntry > ChargeIncTable
Definition: StructChecker.h:149
RDKit::StructureCheck::StructCheckerOptions::CollisionLimitPercent
int CollisionLimitPercent
Definition: StructChecker.h:126
RDKit::StructureCheck::StructCheckerOptions
Structure Check Options.
Definition: StructChecker.h:121
RDKit::StructureCheck::IncEntry::mult_inc_used
int mult_inc_used
Definition: StructChecker.h:107
RDKit::StructureCheck::StructChecker::StructChecker
StructChecker(const StructCheckerOptions &options)
Definition: StructChecker.h:280
RDKit::StructureCheck::AROMATIC
@ AROMATIC
Definition: StructChecker.h:49
RDKit::RWMol
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:31
RDKit::StructureCheck::IncEntry::local_inc_used
int local_inc_used
Definition: StructChecker.h:104
RDKit::StructureCheck::AABondType
AABondType
Definition: StructChecker.h:44
RDKit::StructureCheck::StructCheckerOptions::DesiredCharge
int DesiredCharge
Definition: StructChecker.h:124
RDKit::StructureCheck::StructChecker::SetOptions
void SetOptions(const StructCheckerOptions &options)
Definition: StructChecker.h:284
RDKit::StructureCheck::AugmentedAtom::AugmentedAtom
AugmentedAtom(const std::string &symbol, const std::string &name, int charge, RadicalType radical, AATopology topology)
Definition: StructChecker.h:87
RDKit::StructureCheck::AugmentedAtom::AugmentedAtom
AugmentedAtom()
Definition: StructChecker.h:84
RDKit::StructureCheck::StructCheckerOptions::AtomAcidity
std::vector< IncEntry > AtomAcidity
Definition: StructChecker.h:148
RDKit::StructureCheck::ANY_RADICAL
@ ANY_RADICAL
Definition: StructChecker.h:41
RDKit::StructureCheck::Ligand
Definition: StructChecker.h:63
RDKit::StructureCheck::SINGLE_AROMATIC
@ SINGLE_AROMATIC
Definition: StructChecker.h:51
RDKit::StructureCheck::loadOptionsFromFiles
RDKIT_STRUCTCHECKER_EXPORT bool loadOptionsFromFiles(StructCheckerOptions &op, const std::string &augmentedAtomTranslationsFile="", const std::string &patternFile="", const std::string &rotatePatternFile="", const std::string &stereoPatternFile="", const std::string &tautomerFile="")
RDKit::StructureCheck::DOUBLE_AROMATIC
@ DOUBLE_AROMATIC
Definition: StructChecker.h:52
RDKit::StructureCheck::DOUBLET
@ DOUBLET
Definition: StructChecker.h:39
RDKit::StructureCheck::AATopology
AATopology
Definition: StructChecker.h:57
RDKit::StructureCheck::StructCheckerOptions::ConvertSText
bool ConvertSText
Definition: StructChecker.h:128
RDKit::StructureCheck::StructChecker::GetOptions
const StructCheckerOptions & GetOptions() const
Definition: StructChecker.h:283
RDKit::StructureCheck::StructCheckerOptions::RemoveMinorFragments
bool RemoveMinorFragments
Definition: StructChecker.h:123
RDKit::StructureCheck::TRIPLET
@ TRIPLET
Definition: StructChecker.h:40
RDKit::StructureCheck::IncEntry::AlphaInc
double AlphaInc
Definition: StructChecker.h:99
RDKit::StructureCheck::PathEntry::Cond
double Cond
Definition: StructChecker.h:112
RDKit::StructureCheck::Ligand::Radical
RadicalType Radical
Definition: StructChecker.h:66
RDKit::StructureCheck::StructCheckerOptions::CheckCollisions
bool CheckCollisions
Definition: StructChecker.h:125
RDKit::StructureCheck::IncEntry::alpha_inc_used
int alpha_inc_used
Definition: StructChecker.h:105
RDKit::StructureCheck::parseOptionsJSON
RDKIT_STRUCTCHECKER_EXPORT bool parseOptionsJSON(const std::string &json, StructCheckerOptions &op)
RDKit::StructureCheck::RING
@ RING
Definition: StructChecker.h:59
RDKit::StructureCheck::StructCheckerOptions::Verbose
bool Verbose
Definition: StructChecker.h:134
RDKit::StructureCheck::ANY_BOND
@ ANY_BOND
Definition: StructChecker.h:53
RDKit::StructureCheck::Ligand::SubstitutionCount
unsigned SubstitutionCount
Definition: StructChecker.h:67
RDKit::StructureCheck::StructCheckerOptions::RotatePatterns
std::vector< ROMOL_SPTR > RotatePatterns
Definition: StructChecker.h:141
RDKit::StructureCheck::CHAIN
@ CHAIN
Definition: StructChecker.h:60
RDKit::StructureCheck::DOUBLE
@ DOUBLE
Definition: StructChecker.h:47
symbol
static const char * symbol[119]
Definition: mf.h:259
RDKit::StructureCheck::AugmentedAtom::AtomSymbol
std::string AtomSymbol
Definition: StructChecker.h:77
RDKit::StructureCheck::IncEntry::AtomSymbol
std::string AtomSymbol
Definition: StructChecker.h:97
RDKit::StructureCheck::StructCheckerOptions::StripZeros
bool StripZeros
Definition: StructChecker.h:130
RDKit::StructureCheck::RT_NONE
@ RT_NONE
Definition: StructChecker.h:37
RDKit::StructureCheck::StructCheckerOptions::GroupsToSGroups
bool GroupsToSGroups
Definition: StructChecker.h:133
RDKit::StructureCheck::StructCheckerOptions::BetaPathTable
std::vector< PathEntry > BetaPathTable
Definition: StructChecker.h:158
RDKit::StructureCheck::AugmentedAtom
Definition: StructChecker.h:76
RDKit::StructureCheck::AugmentedAtom::Radical
RadicalType Radical
Definition: StructChecker.h:80
RDKit::StructureCheck::BT_NONE
@ BT_NONE
Definition: StructChecker.h:45
RDKit::StructureCheck::StructCheckerOptions::AcidicAtoms
std::vector< AugmentedAtom > AcidicAtoms
Definition: StructChecker.h:138
RDKit::StructureCheck::ANY_CHARGE
static const int ANY_CHARGE
Definition: StructChecker.h:35
RDKit
Std stuff.
Definition: Atom.h:30
RDKit::StructureCheck::RadicalType
RadicalType
Definition: StructChecker.h:36
RDKit::StructureCheck::AugmentedAtom::Charge
int Charge
Definition: StructChecker.h:79
RDKit::StructureCheck::SINGLE
@ SINGLE
Definition: StructChecker.h:46
RDKit::StructureCheck::AugmentedAtom::Ligands
std::vector< Ligand > Ligands
Definition: StructChecker.h:82
RDKit::StructureCheck::PathEntry
Definition: StructChecker.h:110
RDKit::StructureCheck::IncEntry::BetaInc
double BetaInc
Definition: StructChecker.h:100
RDKit::StructureCheck::PathEntry::cond_used
int cond_used
Definition: StructChecker.h:114
RDKit::StructureCheck::Ligand::BondType
AABondType BondType
Definition: StructChecker.h:68
RDKit::StructureCheck::StructChecker
Class for performing structure validation and cleanup.
Definition: StructChecker.h:245
RDKit::StructureCheck::StructChecker::StructChecker
StructChecker()
Definition: StructChecker.h:279
RDKit::StructureCheck::StructCheckerOptions::GoodAtoms
std::vector< AugmentedAtom > GoodAtoms
Definition: StructChecker.h:139
RDKit::StructureCheck::ALL_BOND_TYPES
@ ALL_BOND_TYPES
Definition: StructChecker.h:54
RDKit::StructureCheck::StructCheckerOptions::SqueezeIdentifiers
bool SqueezeIdentifiers
Definition: StructChecker.h:129
RDKit::StructureCheck::StructCheckerOptions::AcidityLimit
double AcidityLimit
Definition: StructChecker.h:122
RDKit::StructureCheck::StructCheckerOptions::clear
void clear()
Definition: StructChecker.h:163
RDKit::StructureCheck::StructCheckerOptions::Patterns
std::vector< ROMOL_SPTR > Patterns
Definition: StructChecker.h:140
RDKit::StructureCheck::IncEntry::LocalInc
double LocalInc
Definition: StructChecker.h:98
RDKit::StructureCheck::SINGLET
@ SINGLET
Definition: StructChecker.h:38
RDKit::StructureCheck::StructCheckerOptions::MaxMolSize
unsigned MaxMolSize
Definition: StructChecker.h:127
RDKIT_STRUCTCHECKER_EXPORT
#define RDKIT_STRUCTCHECKER_EXPORT
Definition: export.h:645
RDKit::StructureCheck::StructCheckerOptions::ToTautomer
std::vector< ROMOL_SPTR > ToTautomer
Definition: StructChecker.h:144
RDKit::StructureCheck::StructCheckerOptions::StereoPatterns
std::vector< ROMOL_SPTR > StereoPatterns
Definition: StructChecker.h:142
RDKit::StructureCheck::PathEntry::Path
AugmentedAtom Path
Definition: StructChecker.h:111
RDKit::StructureCheck::TRIPLE
@ TRIPLE
Definition: StructChecker.h:48
RDKit::StructureCheck::IncEntry
Definition: StructChecker.h:96
RDKit::StructureCheck::StructCheckerOptions::ElnegTable
std::map< unsigned, double > ElnegTable
Definition: StructChecker.h:147
RDKit::StructureCheck::IncEntry::MultInc
double MultInc
Definition: StructChecker.h:101
RDKit::StructureCheck::SINGLE_DOUBLE
@ SINGLE_DOUBLE
Definition: StructChecker.h:50
RDKit::StructureCheck::Ligand::AtomSymbol
std::string AtomSymbol
Definition: StructChecker.h:64
RDKit::StructureCheck::StructCheckerOptions::CheckStereo
bool CheckStereo
Definition: StructChecker.h:131
export.h