SqlTokenizer.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. <?php
  2. /**
  3. * @link http://www.yiiframework.com/
  4. * @copyright Copyright (c) 2008 Yii Software LLC
  5. * @license http://www.yiiframework.com/license/
  6. */
  7. namespace yii\db;
  8. use yii\base\Component;
  9. use yii\base\InvalidArgumentException;
  10. /**
  11. * SqlTokenizer splits an SQL query into individual SQL tokens.
  12. *
  13. * It can be used to obtain additional information from SQL code.
  14. *
  15. * Usage example:
  16. *
  17. * ```php
  18. * $tokenizer = new SqlTokenizer("SELECT * FROM user WHERE id = 1");
  19. * $root = $tokeinzer->tokenize();
  20. * $sqlTokens = $root->getChildren();
  21. * ```
  22. *
  23. * Tokens are instances of [[SqlToken]].
  24. *
  25. * @author Sergey Makinen <sergey@makinen.ru>
  26. * @since 2.0.13
  27. */
  abstract class SqlTokenizer extends Component
  {
      /**
       * @var string SQL code.
       */
      public $sql;

      /**
       * @var int SQL code string length, measured in UTF-8 characters (see [[tokenize()]]).
       */
      protected $length;
      /**
       * @var int SQL code string current offset, measured in UTF-8 characters.
       */
      protected $offset;

      /**
       * @var \SplStack stack of active tokens.
       */
      private $_tokenStack;
      /**
       * @var SqlToken active token. It's usually a top of the token stack.
       */
      private $_currentToken;
      /**
       * @var string[] cached substrings. Keys have the form `offset,length,caseFlag`;
       * the cache is emptied every time the offset advances.
       */
      private $_substrings;
      /**
       * @var string current buffer value. It accumulates characters that were not recognized
       * as whitespace, comment, operator or delimited string.
       */
      private $_buffer = '';
      /**
       * @var SqlToken resulting token of a last [[tokenize()]] call.
       */
      private $_token;


      /**
       * Constructor.
       * @param string $sql SQL code to be tokenized.
       * @param array $config name-value pairs that will be used to initialize the object properties
       */
      public function __construct($sql, $config = [])
      {
          $this->sql = $sql;
          parent::__construct($config);
      }

      /**
       * Tokenizes and returns a code type token.
       *
       * The returned root token has the [[SqlToken::TYPE_CODE]] type and one
       * [[SqlToken::TYPE_STATEMENT]] child per statement. A trailing statement
       * without children is removed from the result.
       *
       * @return SqlToken code type token.
       * @throws InvalidArgumentException if a matcher reports a match without a positive length.
       */
      public function tokenize()
      {
          $this->length = mb_strlen($this->sql, 'UTF-8');
          $this->offset = 0;
          $this->_substrings = [];
          $this->_buffer = '';
          $this->_token = new SqlToken([
              'type' => SqlToken::TYPE_CODE,
              'content' => $this->sql,
          ]);
          // The stack always starts as [code root, first statement].
          $this->_tokenStack = new \SplStack();
          $this->_tokenStack->push($this->_token);
          $this->_token[] = new SqlToken(['type' => SqlToken::TYPE_STATEMENT]);
          $this->_tokenStack->push($this->_token[0]);
          $this->_currentToken = $this->_tokenStack->top();

          while (!$this->isEof()) {
              // Reset on every iteration: a matcher that returns `true` without setting the
              // length must fail in advance() instead of silently reusing a stale value.
              $length = null;

              if ($this->isWhitespace($length) || $this->isComment($length)) {
                  // Whitespace and comments terminate the buffered word and produce no token.
                  $this->addTokenFromBuffer();
                  $this->advance($length);
                  continue;
              }

              if ($this->tokenizeOperator($length) || $this->tokenizeDelimitedString($length)) {
                  $this->advance($length);
                  continue;
              }

              // Anything else is collected character by character until a boundary is met.
              $this->_buffer .= $this->substring(1);
              $this->advance(1);
          }
          $this->addTokenFromBuffer();

          // Drop the empty statement that is left after a terminating `;` (or for empty input).
          if ($this->_token->getHasChildren() && !$this->_token[-1]->getHasChildren()) {
              unset($this->_token[-1]);
          }

          return $this->_token;
      }

      /**
       * Returns whether there's a whitespace at the current offset.
       * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
       * @param int $length length of the matched string.
       * @return bool whether there's a whitespace at the current offset.
       */
      abstract protected function isWhitespace(&$length);

      /**
       * Returns whether there's a commentary at the current offset.
       * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
       * @param int $length length of the matched string.
       * @return bool whether there's a commentary at the current offset.
       */
      abstract protected function isComment(&$length);

      /**
       * Returns whether there's an operator at the current offset.
       * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
       * It may also set `$content` to a string that will be used as a token content.
       * @param int $length length of the matched string.
       * @param string $content optional content instead of the matched string.
       * @return bool whether there's an operator at the current offset.
       */
      abstract protected function isOperator(&$length, &$content);

      /**
       * Returns whether there's an identifier at the current offset.
       * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
       * It may also set `$content` to a string that will be used as a token content.
       * @param int $length length of the matched string.
       * @param string $content optional content instead of the matched string.
       * @return bool whether there's an identifier at the current offset.
       */
      abstract protected function isIdentifier(&$length, &$content);

      /**
       * Returns whether there's a string literal at the current offset.
       * If this method returns `true`, it has to set the `$length` parameter to the length of the matched string.
       * It may also set `$content` to a string that will be used as a token content.
       * @param int $length length of the matched string.
       * @param string $content optional content instead of the matched string.
       * @return bool whether there's a string literal at the current offset.
       */
      abstract protected function isStringLiteral(&$length, &$content);

      /**
       * Returns whether the given string is a keyword.
       * The method may set `$content` to a string that will be used as a token content.
       * @param string $string string to be matched.
       * @param string $content optional content instead of the matched string.
       * @return bool whether the given string is a keyword.
       */
      abstract protected function isKeyword($string, &$content);

      /**
       * Returns whether the longest common prefix equals to the SQL code of the same length at the current offset.
       *
       * On the first call the flat list in `$with` is replaced by a lookup map of the form
       * `[length => [string => true]]`, ordered from the longest to the shortest length.
       * The map is built with the `$caseSensitive` value of that first call, so the same
       * array must not be reused later with a different `$caseSensitive` value.
       *
       * `$length` and `$content` are written only when a match is found.
       *
       * @param string[] $with strings to be tested.
       * The method **will** modify this parameter to speed up lookups.
       * @param bool $caseSensitive whether to perform a case sensitive comparison.
       * @param int|null $length length of the matched string.
       * @param string|null $content matched string (uppercased if `$caseSensitive` is `false`).
       * @return bool whether a match is found.
       */
      protected function startsWithAnyLongest(array &$with, $caseSensitive, &$length = null, &$content = null)
      {
          if (empty($with)) {
              return false;
          }

          // A first element that is not an array means the list was not converted to a map yet.
          if (!is_array(reset($with))) {
              usort($with, static function ($string1, $string2) {
                  return mb_strlen($string2, 'UTF-8') - mb_strlen($string1, 'UTF-8');
              });
              $map = [];
              foreach ($with as $string) {
                  $key = $caseSensitive ? $string : mb_strtoupper($string, 'UTF-8');
                  $map[mb_strlen($string, 'UTF-8')][$key] = true;
              }
              $with = $map;
          }

          foreach ($with as $testLength => $testValues) {
              // Use a local so that the caller's $content is untouched when nothing matches.
              $candidate = $this->substring($testLength, $caseSensitive);
              if (isset($testValues[$candidate])) {
                  $length = $testLength;
                  $content = $candidate;
                  return true;
              }
          }

          return false;
      }

      /**
       * Returns a string of the given length starting with the specified offset.
       * Results are cached until the current offset advances.
       * @param int $length string length to be returned.
       * @param bool $caseSensitive if it's `false`, the string will be uppercased.
       * @param int|null $offset SQL code offset, defaults to current if `null` is passed.
       * @return string result string, it is empty if the requested range exceeds the SQL code.
       */
      protected function substring($length, $caseSensitive = true, $offset = null)
      {
          if ($offset === null) {
              $offset = $this->offset;
          }

          if ($offset + $length > $this->length) {
              return '';
          }

          // The ",1" entry holds the original text, the ",0" entry its uppercased form.
          $cacheKey = $offset . ',' . $length;
          if (!isset($this->_substrings[$cacheKey . ',1'])) {
              $this->_substrings[$cacheKey . ',1'] = mb_substr($this->sql, $offset, $length, 'UTF-8');
          }
          if (!$caseSensitive && !isset($this->_substrings[$cacheKey . ',0'])) {
              $this->_substrings[$cacheKey . ',0'] = mb_strtoupper($this->_substrings[$cacheKey . ',1'], 'UTF-8');
          }

          return $this->_substrings[$cacheKey . ',' . (int) $caseSensitive];
      }

      /**
       * Returns an index after the given string in the SQL code starting with the specified offset.
       * @param string $string string to be found.
       * @param int|null $offset SQL code offset, defaults to current if `null` is passed.
       * @return int index after the given string or end of string index.
       */
      protected function indexAfter($string, $offset = null)
      {
          if ($offset === null) {
              $offset = $this->offset;
          }

          $stringLength = mb_strlen($string, 'UTF-8');
          // The string cannot fit into the remaining code, so there is nothing to search for.
          if ($offset + $stringLength > $this->length) {
              return $this->length;
          }

          $position = mb_strpos($this->sql, $string, $offset, 'UTF-8');
          if ($position === false) {
              return $this->length;
          }

          return $position + $stringLength;
      }

      /**
       * Determines whether there is a delimited string at the current offset and adds it to the token children.
       * An identifier is tried first, a string literal second.
       * @param int $length length of the matched string, set by the matcher on success.
       * @return bool whether an identifier or string literal token has been added.
       */
      private function tokenizeDelimitedString(&$length)
      {
          $content = null;
          $isIdentifier = $this->isIdentifier($length, $content);
          if (!$isIdentifier) {
              // Discard anything a failed identifier match may have written to $content.
              $content = null;
              if (!$this->isStringLiteral($length, $content)) {
                  return false;
              }
          }

          $this->addTokenFromBuffer();
          $this->_currentToken[] = new SqlToken([
              'type' => $isIdentifier ? SqlToken::TYPE_IDENTIFIER : SqlToken::TYPE_STRING_LITERAL,
              'content' => is_string($content) ? $content : $this->substring($length),
              'startOffset' => $this->offset,
              'endOffset' => $this->offset + $length,
          ]);

          return true;
      }

      /**
       * Determines whether there is an operator at the current offset and adds it to the token children.
       *
       * Three operators also change the token tree:
       * - `(` adds the operator and opens a nested [[SqlToken::TYPE_PARENTHESIS]] token;
       * - `)` closes the current parenthesis token (if one is open) and adds the operator to its parent;
       * - `;` adds the operator, closes the current token and starts a new [[SqlToken::TYPE_STATEMENT]].
       *
       * @param int $length length of the matched string, set by [[isOperator()]] on success.
       * @return bool whether there is an operator at the current offset.
       */
      private function tokenizeOperator(&$length)
      {
          $content = null;
          if (!$this->isOperator($length, $content)) {
              return false;
          }

          $this->addTokenFromBuffer();
          $operator = $this->substring($length);
          $operatorContent = is_string($content) ? $content : $operator;

          switch ($operator) {
              case '(':
                  $this->_currentToken[] = $this->createOperatorToken($operatorContent, $length);
                  $this->_currentToken[] = new SqlToken(['type' => SqlToken::TYPE_PARENTHESIS]);
                  $this->_tokenStack->push($this->_currentToken[-1]);
                  $this->_currentToken = $this->_tokenStack->top();
                  break;
              case ')':
                  // Only leave a group that is actually open. Popping on a stray `)` would detach
                  // the statement (and then the root) from the stack, and top() throws on an empty stack.
                  if ($this->_currentToken->type === SqlToken::TYPE_PARENTHESIS) {
                      $this->_tokenStack->pop();
                      $this->_currentToken = $this->_tokenStack->top();
                  }
                  $this->_currentToken[] = $this->createOperatorToken(')', $length);
                  break;
              case ';':
                  // A `;` with nothing before it does not produce an empty statement.
                  if (!$this->_currentToken->getHasChildren()) {
                      break;
                  }

                  $this->_currentToken[] = $this->createOperatorToken($operatorContent, $length);
                  // NOTE(review): if `;` is met inside an unclosed `(`, only that group is popped and the
                  // new statement becomes a child of the current statement - confirm whether this is intended.
                  $this->_tokenStack->pop();
                  $this->_currentToken = $this->_tokenStack->top();
                  $this->_currentToken[] = new SqlToken(['type' => SqlToken::TYPE_STATEMENT]);
                  $this->_tokenStack->push($this->_currentToken[-1]);
                  $this->_currentToken = $this->_tokenStack->top();
                  break;
              default:
                  $this->_currentToken[] = $this->createOperatorToken($operatorContent, $length);
                  break;
          }

          return true;
      }

      /**
       * Creates an operator token that starts at the current offset.
       * @param string $content token content.
       * @param int $length length of the operator in the SQL code.
       * @return SqlToken operator type token.
       */
      private function createOperatorToken($content, $length)
      {
          return new SqlToken([
              'type' => SqlToken::TYPE_OPERATOR,
              'content' => $content,
              'startOffset' => $this->offset,
              'endOffset' => $this->offset + $length,
          ]);
      }

      /**
       * Determines a type of text in the buffer, tokenizes it and adds it to the token children.
       * The buffer is emptied afterwards. Nothing happens if the buffer is empty.
       */
      private function addTokenFromBuffer()
      {
          if ($this->_buffer === '') {
              return;
          }

          $content = null;
          $isKeyword = $this->isKeyword($this->_buffer, $content);
          $this->_currentToken[] = new SqlToken([
              'type' => $isKeyword ? SqlToken::TYPE_KEYWORD : SqlToken::TYPE_TOKEN,
              'content' => is_string($content) ? $content : $this->_buffer,
              // The buffer ends right before the current offset.
              'startOffset' => $this->offset - mb_strlen($this->_buffer, 'UTF-8'),
              'endOffset' => $this->offset,
          ]);
          $this->_buffer = '';
      }

      /**
       * Adds the specified length to the current offset and drops the substring cache.
       * @param int $length
       * @throws InvalidArgumentException if the length is not greater than 0.
       */
      private function advance($length)
      {
          if ($length <= 0) {
              throw new InvalidArgumentException('Length must be greater than 0.');
          }

          $this->offset += $length;
          $this->_substrings = [];
      }

      /**
       * Returns whether the SQL code is completely traversed.
       * @return bool
       */
      private function isEof()
      {
          return $this->offset >= $this->length;
      }
  }