WebSocket++  0.8.1
C++ websocket client/server library
utf8_validator.hpp
1 /*
2  * The following code is adapted from code originally written by Bjoern
3  * Hoehrmann <bjoern@hoehrmann.de>. See
4  * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
5  *
6  * The original license:
7  *
8  * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26  * SOFTWARE.
27 */
28 
29 #ifndef UTF8_VALIDATOR_HPP
30 #define UTF8_VALIDATOR_HPP
31 
32 #include <websocketpp/common/stdint.hpp>
33 
34 #include <string>
35 
namespace websocketpp {
namespace utf8_validator {

/// State that represents a valid utf8 input sequence
static unsigned int const utf8_accept = 0;
/// State that represents an invalid utf8 input sequence
static unsigned int const utf8_reject = 1;

/// Lookup table for the UTF8 decode state machine
/**
 * The first 256 entries map every possible input byte to a character class.
 * They are followed by the transition table: one row of 16 entries per state
 * (s0 through s8), indexed by character class and yielding the next state.
 */
static uint8_t const utf8d[] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/// Decode the next byte of a UTF8 sequence
/**
 * Both `state` and `codep` are read before they are written, so they must be
 * initialized by the caller and carried over unchanged between calls.
 *
 * @param [in,out] state The decoder state to advance. Must be utf8_accept
 * before the first byte and afterwards only a value stored by a previous call;
 * the table has rows for states 0..8 only and no range check is performed.
 * @param [in,out] codep The codepoint being assembled. It is restarted from
 * the payload bits of `byte` when `*state` is utf8_accept on entry, otherwise
 * it is shifted left by six bits and the low six bits of `byte` are appended.
 * @param [in] byte The byte to input
 * @return The ending state of the decode operation: utf8_accept when `byte`
 * completed a code point, utf8_reject when the sequence is invalid (every
 * further byte then yields utf8_reject as well), any other value when more
 * bytes are required.
 */
inline uint32_t decode(uint32_t * state, uint32_t * codep, uint8_t byte) {
    // Character class of the input byte (first 256 table entries).
    uint32_t type = utf8d[byte];

    // Continuation bytes contribute their low six bits; a leading byte
    // contributes whatever is left after masking off its length prefix.
    *codep = (*state != utf8_accept) ?
        (byte & 0x3fu) | (*codep << 6) :
        (0xff >> type) & (byte);

    // Transition rows start at offset 256 and are 16 entries wide.
    *state = utf8d[256 + *state*16 + type];
    return *state;
}

/// Provides streaming UTF8 validation functionality
class validator {
public:
    /// Construct and initialize the validator
    validator() : m_state(utf8_accept),m_codepoint(0) {}

    /// Advance the state of the validator with the next input byte
    /**
     * @param byte The byte to advance the validation state with
     * @return False if the byte put the validator into the rejecting state,
     * true otherwise. True does not imply that a code point is complete; see
     * `complete()`.
     */
    bool consume (uint8_t byte) {
        return utf8_validator::decode(&m_state,&m_codepoint,byte) != utf8_reject;
    }

    /// Advance validator state with input from an iterator pair
    /**
     * Processing stops at the first byte that causes a validation error.
     *
     * @param begin Input iterator to the start of the input range
     * @param end Input iterator to the end of the input range
     * @return False if any byte in the range caused a validation error, true
     * otherwise (including for an empty range).
     */
    template <typename iterator_type>
    bool decode (iterator_type begin, iterator_type end) {
        for (iterator_type it = begin; it != end; ++it) {
            if (!consume(static_cast<uint8_t>(*it))) {
                return false;
            }
        }
        return true;
    }

    /// Return whether the input sequence ended on a valid utf8 codepoint
    /**
     * @return True if the validator is in the accepting state, i.e. no error
     * has occurred and no multi-byte sequence is left unfinished.
     */
    bool complete() const {
        return m_state == utf8_accept;
    }

    /// Reset the validator to decode another message
    void reset() {
        m_state = utf8_accept;
        m_codepoint = 0;
    }
private:
    uint32_t m_state;
    uint32_t m_codepoint;
};

/// Validate a UTF8 string
/**
 * Convenience function that creates a validator, feeds it the complete string
 * and returns the result.
 *
 * @param s The bytes to validate
 * @return True if `s` is well-formed UTF8 that ends on a code point boundary
 * (an empty string is valid), false otherwise.
 */
inline bool validate(std::string const & s) {
    validator v;
    return v.decode(s.begin(),s.end()) && v.complete();
}

} // namespace utf8_validator
} // namespace websocketpp
153 
154 #endif // UTF8_VALIDATOR_HPP
bool decode(iterator_type begin, iterator_type end)
Advance validator state with input from an iterator pair.
Provides streaming UTF8 validation functionality.
lib::weak_ptr< void > connection_hdl
A handle to uniquely identify a connection.
bool consume(uint8_t byte)
Advance the state of the validator with the next input byte.
void reset()
Reset the validator to decode another message.
bool complete()
Return whether the input sequence ended on a valid utf8 codepoint.
validator()
Construct and initialize the validator.