RDKit
Open-source cheminformatics and machine learning.
python_streambuf.h
Go to the documentation of this file.
1 //
2 // This file is part of the CCTBX distribution:
3 // http://cctbx.sourceforge.net/
4 // Downloaded from here:
5 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
6 //
7 // Copyright (c) 2006, The Regents of the University of
8 // California, through Lawrence Berkeley National Laboratory (subject to
9 // receipt of any required approvals from the U.S. Dept. of Energy). All
10 // rights reserved.
11 //
12 // The license is here:
13 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
14 //
15 #include <RDGeneral/export.h>
16 #ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
17 #define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
19 #include <boost/python/object.hpp>
20 #include <boost/python/str.hpp>
21 #include <boost/python/extract.hpp>
22 
23 #include <boost/optional.hpp>
24 #include <boost/utility/typed_in_place_factory.hpp>
26 
27 //#include <tbxx/error_utils.hpp>
28 #include <RDGeneral/Invariant.h>
29 #include <RDGeneral/Exceptions.h>
30 
31 #include <streambuf>
32 #include <iostream>
33 
34 namespace boost_adaptbx {
35 namespace python {
36 
37 namespace bp = boost::python;
38 
39 /// A stream buffer getting data from and putting data into a Python file object
40 /** The aims are as follows:
41 
42  - Given a C++ function acting on a standard stream, e.g.
43 
44  \code
45  void read_inputs(std::istream& input) {
46  ...
47  input >> something >> something_else;
48  }
49  \endcode
50 
51  and given a piece of Python code which creates a file-like object,
52  to be able to pass this file object to that C++ function, e.g.
53 
54  \code
55  import gzip
56  gzip_file_obj = gzip.GzipFile(...)
57  read_inputs(gzip_file_obj)
58  \endcode
59 
60  and have the standard stream pull data from and put data into the Python
61  file object.
62 
63  - When Python \c read_inputs() returns, the Python object is able to
64  continue reading or writing where the C++ code left off.
65 
66  - Operations in C++ on mere files should be competitively fast compared
67  to the direct use of \c std::fstream.
68 
69 
70  \b Motivation
71 
72  - the range of file-like objects offered by the standard Python library
73  (files, compressed files and archives, network, ...) is far superior to
74  the range of streams offered by the C++ standard library and Boost C++ libraries.
75 
76  - i/o code involves a fair amount of text processing which is more
77  efficiently prototyped in Python but then one may need to rewrite
78  a time-critical part in C++, in as seamless a manner as possible.
79 
80  \b Usage
81 
82  This is 2-step:
83 
84  - a trivial wrapper function
85 
86  \code
87  using boost_adaptbx::python::streambuf;
88  void read_inputs_wrapper(streambuf& input)
89  {
90  streambuf::istream is(input);
91  read_inputs(is);
92  }
93 
94  def("read_inputs", read_inputs_wrapper);
95  \endcode
96 
97  which has to be written every time one wants a Python binding for
98  such a C++ function.
99 
100  - the Python side
101 
102  \code
103  from boost.python import streambuf
104  read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
105  \endcode
106 
107  \c buffer_size is optional. See also: \c default_buffer_size
108 
109  Note: references are to the C++ standard (the numbers between parentheses
110  at the end of references are margin markers).
111 */
112 class streambuf : public std::basic_streambuf<char> {
113  private:
114  typedef std::basic_streambuf<char> base_t;
115 
116  public:
117  /* The syntax
118  using base_t::char_type;
119  would be nicer but Visual Studio C++ 8 chokes on it
120  */
121  typedef base_t::char_type char_type;
122  typedef base_t::int_type int_type;
123  typedef base_t::pos_type pos_type;
124  typedef base_t::off_type off_type;
125  typedef base_t::traits_type traits_type;
126 
127  // work around Visual C++ 7.1 problem
     // Returns traits_type::eof() as an int through a static member function;
     // it serves as the default argument of overflow().
128  inline static int traits_type_eof() { return traits_type::eof(); }
129 
130  /// The default size of the read and write buffer.
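131  /** The same size is used for the buffer holding data read from the Python
132  file object and for the buffer holding data to be written to it. Note that
     it is declared const here, so it cannot be changed at run time (from
     Python or otherwise); pass a non-zero buffer size to the constructor to
     use a different size.
133  */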
134  const static std::size_t default_buffer_size = 1024;
135 
136  /// Construct from a Python file object
137  /** if buffer_size is 0 the current default_buffer_size is used.

     The "read", "write", "seek" and "tell" attributes of the file object are
     looked up once here; an attribute that is missing is stored as None
     (a default-constructed bp::object).

     \param python_file_obj the Python file-like object to wrap
     \param buffer_size_ requested size of the read and write buffers;
     0 selects default_buffer_size
138  */
139  streambuf(bp::object& python_file_obj, std::size_t buffer_size_ = 0)
140  : py_read(getattr(python_file_obj, "read", bp::object())),
141  py_write(getattr(python_file_obj, "write", bp::object())),
142  py_seek(getattr(python_file_obj, "seek", bp::object())),
143  py_tell(getattr(python_file_obj, "tell", bp::object())),
144  buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
145  write_buffer(0),
146  pos_of_read_buffer_end_in_py_file(0),
     // provisional value; overwritten at the end of the constructor when
     // tell() is usable
147  pos_of_write_buffer_end_in_py_file(buffer_size),
148  farthest_pptr(0) {
149  TEST_ASSERT(buffer_size != 0);
150  /* Some Python file objects (e.g. sys.stdout and sys.stdin)
151  have non-functional seek and tell. If so, assign None to
152  py_tell and py_seek.
153  */
154  if (py_tell != bp::object()) {
155  try {
     // Probe tell(); if it works and seek exists, seek to the reported
     // position to check that seeking works as well.
156  off_type py_pos = bp::extract<off_type>(py_tell());
157  if (py_seek != bp::object()) {
158  /* Make sure we can actually seek.
159  bzip2 readers from python have a seek method, but it fails
160  when they are in write mode.
161  */
162  py_seek(py_pos);
163  }
164  } catch (bp::error_already_set&) {
     // Either call raised in Python: treat the object as non-seekable.
165  py_tell = bp::object();
166  py_seek = bp::object();
167  /* Boost.Python does not do any Python exception handling whatsoever
168  So we need to catch it by hand like so.
169  */
170  PyErr_Clear();
171  }
172  }
173 
     // The put area is only allocated when the file object has a "write"
     // attribute; otherwise write_buffer stays null.
174  if (py_write != bp::object()) {
175  // C-like string to make debugging easier
176  write_buffer = new char[buffer_size + 1];
177  write_buffer[buffer_size] = '\0';
178  setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5)
179  farthest_pptr = pptr();
180  } else {
181  // The first attempt at output will result in a call to overflow
182  setp(0, 0);
183  }
184 
     // When tell() survived the probe above, both buffer-end positions start
     // at the current position of the Python file.
185  if (py_tell != bp::object()) {
186  off_type py_pos = bp::extract<off_type>(py_tell());
187  pos_of_read_buffer_end_in_py_file = py_pos;
188  pos_of_write_buffer_end_in_py_file = py_pos;
189  }
190  }
191 
192  /// constructor to enforce a mode (binary or text)
     /** Delegates to the two-argument constructor, then checks whether the
     file object is an instance of io.TextIOBase and compares that with the
     requested mode.

     \param python_file_obj the Python file-like object to wrap
     \param mode 't' or 's' to require a text-mode object, 'b' to require a
     binary-mode object
     \param buffer_size_ forwarded to the two-argument constructor

     Throws ValueErrorException when the object does not match the requested
     mode and std::invalid_argument for any other mode character. If the
     isinstance check itself fails in Python, the pending Python error is
     propagated as bp::error_already_set.
     */
193  streambuf(bp::object& python_file_obj, char mode,
194  std::size_t buffer_size_ = 0)
195  : streambuf(python_file_obj, buffer_size_) {
196 #if 1
197  bp::object io_mod = bp::import("io");
198  CHECK_INVARIANT(io_mod,"module not found");
199  bp::object iobase = io_mod.attr("TextIOBase");
200  CHECK_INVARIANT(iobase,"base class not found");
201 #else
202  // using statics to save an undetermined amount of time results in
203  // alarming seg faults on windows. so we don't do it. Keep this here
204  // for the moment though in case someone manages to figure that out in
205  // the future
206  static bp::object io_mod = bp::object();
207  static bp::object iobase = bp::object();
208  if(!io_mod) io_mod = bp::import("io");
209  if(io_mod && !iobase) iobase = io_mod.attr("TextIOBase");
210  CHECK_INVARIANT(io_mod,"module not found");
211  CHECK_INVARIANT(iobase,"base class not found");
212 #endif
213 
     // PyObject_IsInstance returns 1, 0, or -1 on error (with a Python
     // exception set). Converting that straight to bool would treat the
     // error case as "text mode", so handle it explicitly.
214  int isInstance = PyObject_IsInstance(python_file_obj.ptr(), iobase.ptr());
     if (isInstance < 0) bp::throw_error_already_set();
     bool isTextMode = (isInstance == 1);
215  switch (mode) {
216  case 's': /// yeah, is redundant, but it is somehow natural to do "s"
217  case 't':
218  if (!isTextMode)
219  throw ValueErrorException(
220  "Need a text mode file object like StringIO or a file opened "
221  "with mode 't'");
222  break;
223  case 'b':
224  if (isTextMode)
225  throw ValueErrorException(
226  "Need a binary mode file object like BytesIO or a file opened "
227  "with mode 'b'");
228  break;
229  default:
230  throw std::invalid_argument("bad mode character");
231  }
232  }
233 
234  /// Mundane destructor freeing the allocated resources
     /** The only resource released here is write_buffer, which the constructor
     allocates with new[] when the file object has a "write" attribute and
     leaves null otherwise.
     */
235  virtual ~streambuf() {
236  if (write_buffer) delete[] write_buffer;
237  }
238 
239  /// C.f. C++ standard section 27.5.2.4.3
240  /** It is essential to override this virtual function for the stream
241  member function readsome to work correctly (c.f. 27.6.1.3, paragraph 30)

     Calls underflow() to fetch a chunk from the Python file object.

     \return -1 if underflow() reports end of file, otherwise the number of
     characters between gptr() and egptr()
242  */
243  virtual std::streamsize showmanyc() {
244  int_type const failure = traits_type::eof();
245  int_type status = underflow();
246  if (status == failure) return -1;
247  return egptr() - gptr();
248  }
249 
250  /// C.f. C++ standard section 27.5.2.4.3
     /** Reads up to buffer_size characters by calling the Python "read"
     method and makes the returned bytes the new get area.

     \return the first character read, or traits_type::eof() when "read"
     returned no data

     Throws std::invalid_argument if the file object has no "read" attribute
     or if the value returned by "read" cannot be accessed through
     PyBytes_AsStringAndSize.
     */
251  virtual int_type underflow() {
252  int_type const failure = traits_type::eof();
253  if (py_read == bp::object()) {
254  throw std::invalid_argument(
255  "That Python file object has no 'read' attribute");
256  }
     // The result is stored in the member read_buffer so that the character
     // data the get area points into stays alive.
257  read_buffer = py_read(buffer_size);
258  char* read_buffer_data;
259  bp::ssize_t py_n_read;
260  if (PyBytes_AsStringAndSize(read_buffer.ptr(), &read_buffer_data,
261  &py_n_read) == -1) {
     // NOTE(review): PyErr_Clear() is not called on this path, so the Python
     // error raised by PyBytes_AsStringAndSize presumably stays pending;
     // confirm that callers deal with it.
262  setg(0, 0, 0);
263  throw std::invalid_argument(
264  "The method 'read' of the Python file object "
265  "did not return a string.");
266  }
267  off_type n_read = (off_type)py_n_read;
     // Track where the end of the get area lies in the Python file.
268  pos_of_read_buffer_end_in_py_file += n_read;
269  setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
270  // ^^^27.5.2.3.1 (4)
271  if (n_read == 0) return failure;
272  return traits_type::to_int_type(read_buffer_data[0]);
273  }
274 
275  /// C.f. C++ standard section 27.5.2.4.5
277  if (py_write == bp::object()) {
278  throw std::invalid_argument(
279  "That Python file object has no 'write' attribute");
280  }
281  farthest_pptr = std::max(farthest_pptr, pptr());
282  off_type n_written = (off_type)(farthest_pptr - pbase());
283  bp::str chunk(pbase(), farthest_pptr);
284  py_write(chunk);
285  if (!traits_type::eq_int_type(c, traits_type::eof())) {
286  py_write(traits_type::to_char_type(c));
287  n_written++;
288  }
289  if (n_written) {
290  pos_of_write_buffer_end_in_py_file += n_written;
291  setp(pbase(), epptr());
292  // ^^^ 27.5.2.4.5 (5)
293  farthest_pptr = pptr();
294  }
295  return traits_type::eq_int_type(c, traits_type::eof())
296  ? traits_type::not_eof(c)
297  : c;
298  }
299 
299  /// Update the python file to reflect the state of this stream buffer
300  /** Empty the write buffer into the Python file object and set the seek
301  position of the latter accordingly (C++ standard section 27.5.2.4.2).
302  If there is no write buffer or it is empty, but there is a non-empty
303  read buffer, set the Python file object seek position to the
304  seek position in that read buffer.

     \return 0, or -1 if overflow() returned traits_type::eof()
305  */
306  virtual int sync() {
307  int result = 0;
308  farthest_pptr = std::max(farthest_pptr, pptr());
309  if (farthest_pptr && farthest_pptr > pbase()) {
     // delta is the (non-positive) distance from the farthest written
     // character back to the current put position; it is computed before
     // overflow() is called and applied afterwards as a relative seek.
310  off_type delta = pptr() - farthest_pptr;
311  int_type status = overflow();
312  if (traits_type::eq_int_type(status, traits_type::eof())) result = -1;
313  if (py_seek != bp::object()) py_seek(delta, 1);
314  } else if (gptr() && gptr() < egptr()) {
     // Unread characters remain in the get area: move the Python file
     // back by that amount (whence 1 = relative to the current position).
315  if (py_seek != bp::object()) py_seek(gptr() - egptr(), 1);
316  }
317  return result;
318  }
320 
320  /// C.f. C++ standard section 27.5.2.4.2
321  /** This implementation is optimised to look whether the position is within
322  the buffers, so as to avoid calling Python seek or tell. It is
323  important for many applications that the overhead of calling into Python
324  is avoided as much as possible (e.g. parsers which may do a lot of
325  backtracking)

     \param off offset to seek by, interpreted according to \c way
     \param way std::ios_base::beg, cur or end
     \param which std::ios_base::in or std::ios_base::out
     \return the resulting position, or -1 converted to pos_type on failure

     Throws std::invalid_argument if the file object has no "seek" attribute.
326  */
327  virtual pos_type seekoff(off_type off, std::ios_base::seekdir way,
328  std::ios_base::openmode which = std::ios_base::in |
329  std::ios_base::out) {
330  /* In practice, "which" is either std::ios_base::in or out
331  since we end up here because either seekp or seekg was called
332  on the stream using this buffer. That simplifies the code
333  in a few places.
334  */
335  int const failure = off_type(-1);
336 
337  if (py_seek == bp::object()) {
338  throw std::invalid_argument(
339  "That Python file object has no 'seek' attribute");
340  }
341 
342  // we need the read buffer to contain something!
343  if (which == std::ios_base::in && !gptr()) {
344  if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
345  return failure;
346  }
347  }
348 
349  // compute the whence parameter for Python seek
350  int whence;
351  switch (way) {
352  case std::ios_base::beg:
353  whence = 0;
354  break;
355  case std::ios_base::cur:
356  whence = 1;
357  break;
358  case std::ios_base::end:
359  whence = 2;
360  break;
361  default:
362  return failure;
363  }
364 
365  // Let's have a go
366  boost::optional<off_type> result =
367  seekoff_without_calling_python(off, way, which);
368  if (!result) {
369  // we need to call Python
     // For output, overflow() is called first so that the put area is
     // handed to the Python file before it is repositioned.
370  if (which == std::ios_base::out) overflow();
     // For a relative seek, shift off by the unread remainder of the get
     // area (in) or by the number of characters in the put area (out).
371  if (way == std::ios_base::cur) {
372  if (which == std::ios_base::in)
373  off -= egptr() - gptr();
374  else if (which == std::ios_base::out)
375  off += pptr() - pbase();
376  }
377  py_seek(off, whence);
378  result = off_type(bp::extract<off_type>(py_tell()));
     // NOTE(review): pos_of_read_buffer_end_in_py_file is not reassigned from
     // the tell() result here; underflow() only adds the number of characters
     // read to its previous value. Confirm the bookkeeping is still right
     // after a Python-side seek.
379  if (which == std::ios_base::in) underflow();
380  }
381  return *result;
382  }
384 
385  /// C.f. C++ standard section 27.5.2.4.2
387  std::ios_base::openmode which = std::ios_base::in |
388  std::ios_base::out) {
389  return streambuf::seekoff(sp, std::ios_base::beg, which);
390  }
391 
392  private:
393  bp::object py_read, py_write, py_seek, py_tell;
394 
395  std::size_t buffer_size;
396 
397  /* This is actually a Python bytes (string) object and the actual read
398  buffer is its internal data, i.e. an array of characters. We use a
399  Boost.Python object so as to hold on to it: as a result, the actual
400  buffer can't go away.
401  */
402  bp::object read_buffer;
403 
404  /* A mere array of char's allocated on the heap at construction time and
405  de-allocated only at destruction time.
406  */
407  char* write_buffer;
408 
409  off_type pos_of_read_buffer_end_in_py_file,
410  pos_of_write_buffer_end_in_py_file;
411 
412  // the farthest place the buffer has been written into
413  char* farthest_pptr;
414 
415  boost::optional<off_type> seekoff_without_calling_python(
416  off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) {
     // Tries to satisfy a seek by moving the get or put pointer inside the
     // current buffer. Returns the resulting position in the Python file, or
     // an empty optional when the target lies outside the buffer (or way is
     // std::ios_base::end), in which case the caller has to call Python.
417  boost::optional<off_type> const failure;
418 
419  // Buffer range and current position
     // The buffer pointers are converted to integers so that they can be
     // mixed with file offsets in the arithmetic below.
420  off_type buf_begin, buf_end, buf_cur, upper_bound;
421  off_type pos_of_buffer_end_in_py_file;
422  if (which == std::ios_base::in) {
423  pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
424  buf_begin = reinterpret_cast<std::streamsize>(eback());
425  buf_cur = reinterpret_cast<std::streamsize>(gptr());
426  buf_end = reinterpret_cast<std::streamsize>(egptr());
427  upper_bound = buf_end;
428  } else if (which == std::ios_base::out) {
429  pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
430  buf_begin = reinterpret_cast<std::streamsize>(pbase());
431  buf_cur = reinterpret_cast<std::streamsize>(pptr());
432  buf_end = reinterpret_cast<std::streamsize>(epptr());
433  farthest_pptr = std::max(farthest_pptr, pptr());
     // For output the upper bound is one past farthest_pptr, so a seek to
     // farthest_pptr itself is still accepted by the range check below.
434  upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
435  } else {
436  CHECK_INVARIANT(0, "unreachable code");
437  }
438 
439  // Sought position in "buffer coordinate"
440  off_type buf_sought;
441  if (way == std::ios_base::cur) {
442  buf_sought = buf_cur + off;
443  } else if (way == std::ios_base::beg) {
     // Translate the absolute file offset using the known file position of
     // the buffer end.
444  buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
445  } else if (way == std::ios_base::end) {
446  return failure;
447  } else {
448  CHECK_INVARIANT(0, "unreachable code");
449  }
450 
451  // if the sought position is not in the buffer, give up
452  if (buf_sought < buf_begin || buf_sought >= upper_bound) return failure;
453 
454  // we are in wonderland
     // i.e. the target is inside the buffer: move the pointer and report the
     // corresponding position in the Python file.
455  if (which == std::ios_base::in)
456  gbump(buf_sought - buf_cur);
457  else if (which == std::ios_base::out)
458  pbump(buf_sought - buf_cur);
459  return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
460  }
461 
462  public:
463  class istream : public std::istream {
464  public:
465  istream(streambuf& buf) : std::istream(&buf) {
466  exceptions(std::ios_base::badbit);
467  }
468 
470  // do nothing.
471  // This used to do:
472  // if (this->good()) this->sync();
473  // but that caused problems if the underlying file had been closed
474  // (see github #579) and really doesn't seem necessary for what we're
475  // doing.
476  }
477  };
478 
479  class ostream : public std::ostream {
480  public:
481  ostream(streambuf& buf) : std::ostream(&buf) {
482  exceptions(std::ios_base::badbit);
483  }
484 
486  if (this->good()) this->flush();
487  }
488  };
489 };
490 
491 // std::size_t streambuf::default_buffer_size = 1024;
492 
495 
496  streambuf_capsule(bp::object& python_file_obj, std::size_t buffer_size = 0)
497  : python_streambuf(python_file_obj, buffer_size) {}
498 };
499 
501  ostream(bp::object& python_file_obj, std::size_t buffer_size = 0)
502  : streambuf_capsule(python_file_obj, buffer_size),
504 
505  ~ostream() noexcept {
     // Flushes pending output if the stream is still good. A Python error
     // raised during that flush is cleared and reported on std::cerr: a
     // destructor must not throw, and throwing from this one (formerly
     // declared throw()) would end in std::terminate.
506  try {
507  if (this->good()) this->flush();
508  } catch (bp::error_already_set&) {
509  PyErr_Clear();
510  std::cerr <<
511  "Problem closing python ostream.\n"
512  " Known limitation: the error is unrecoverable. Sorry.\n"
513  " Suggestion for programmer: add ostream.flush() before"
514  " returning." << std::endl;
515  }
516  }
517 };
518 } // namespace python
519 } // namespace boost_adaptbx
520 
521 #endif // GUARD
boost_adaptbx::python::ostream
Definition: python_streambuf.h:500
boost_adaptbx::python::streambuf::seekpos
virtual pos_type seekpos(pos_type sp, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
Definition: python_streambuf.h:386
boost_adaptbx::python::streambuf
A stream buffer getting data from and putting data into a Python file object.
Definition: python_streambuf.h:112
boost_adaptbx::python::streambuf::streambuf
streambuf(bp::object &python_file_obj, char mode, std::size_t buffer_size_=0)
constructor to enforce a mode (binary or text)
Definition: python_streambuf.h:193
boost_adaptbx::python::streambuf::seekoff
virtual pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out)
C.f. C++ standard section 27.5.2.4.2.
Definition: python_streambuf.h:328
boost_adaptbx::python::streambuf::traits_type
base_t::traits_type traits_type
Definition: python_streambuf.h:125
boost_adaptbx::python::streambuf_capsule::python_streambuf
streambuf python_streambuf
Definition: python_streambuf.h:494
BoostStartInclude.h
CHECK_INVARIANT
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:101
boost_adaptbx::python::streambuf::~streambuf
virtual ~streambuf()
Mundane destructor freeing the allocated resources.
Definition: python_streambuf.h:235
boost_adaptbx::python::streambuf::istream::istream
istream(streambuf &buf)
Definition: python_streambuf.h:465
boost_adaptbx::python::streambuf::traits_type_eof
static int traits_type_eof()
Definition: python_streambuf.h:128
BoostEndInclude.h
boost_adaptbx::python::streambuf::char_type
base_t::char_type char_type
Definition: python_streambuf.h:121
boost_adaptbx::python::ostream::ostream
ostream(bp::object &python_file_obj, std::size_t buffer_size=0)
Definition: python_streambuf.h:501
TEST_ASSERT
#define TEST_ASSERT(expr)
Definition: Invariant.h:152
boost_adaptbx
Definition: python_streambuf.h:34
boost_adaptbx::python::streambuf::showmanyc
virtual std::streamsize showmanyc()
C.f. C++ standard section 27.5.2.4.3.
Definition: python_streambuf.h:243
boost_adaptbx::python::streambuf::streambuf
streambuf(bp::object &python_file_obj, std::size_t buffer_size_=0)
Construct from a Python file object.
Definition: python_streambuf.h:139
boost_adaptbx::python::streambuf::underflow
virtual int_type underflow()
C.f. C++ standard section 27.5.2.4.3.
Definition: python_streambuf.h:251
ValueErrorException
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:33
Invariant.h
boost_adaptbx::python::streambuf::istream
Definition: python_streambuf.h:463
boost_adaptbx::python::ostream::~ostream
~ostream()
Definition: python_streambuf.h:505
boost_adaptbx::python::streambuf::pos_type
base_t::pos_type pos_type
Definition: python_streambuf.h:123
boost_adaptbx::python::streambuf::overflow
virtual int_type overflow(int_type c=traits_type_eof())
C.f. C++ standard section 27.5.2.4.5.
Definition: python_streambuf.h:276
boost_adaptbx::python::streambuf::int_type
base_t::int_type int_type
Definition: python_streambuf.h:122
boost_adaptbx::python::streambuf_capsule
Definition: python_streambuf.h:493
boost_adaptbx::python::streambuf::ostream
Definition: python_streambuf.h:479
boost_adaptbx::python::streambuf::ostream::~ostream
~ostream()
Definition: python_streambuf.h:485
boost_adaptbx::python::streambuf::default_buffer_size
const static std::size_t default_buffer_size
The default size of the read and write buffer.
Definition: python_streambuf.h:134
boost_adaptbx::python::streambuf::ostream::ostream
ostream(streambuf &buf)
Definition: python_streambuf.h:481
boost_adaptbx::python::streambuf::off_type
base_t::off_type off_type
Definition: python_streambuf.h:124
boost_adaptbx::python::streambuf::sync
virtual int sync()
Update the python file to reflect the state of this stream buffer.
Definition: python_streambuf.h:307
boost_adaptbx::python::streambuf::istream::~istream
~istream()
Definition: python_streambuf.h:469
Exceptions.h
boost_adaptbx::python::streambuf_capsule::streambuf_capsule
streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size=0)
Definition: python_streambuf.h:496
export.h