| 1 | // tRegex.cpp  |
| 2 | //  |
| 3 | // Simple regular expression class. The code is based on TRex but has been significantly modified. The original license  |
| 4 | // follows:  |
| 5 | //  |
| 6 | // Copyright (c) 2003-2006 Alberto Demichelis  |
| 7 | // This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held  |
| 8 | // liable for any damages arising from the use of this software.  |
| 9 | //  |
| 10 | // Permission is granted to anyone to use this software for any purpose, including commercial applications, and to  |
| 11 | // alter it and redistribute it freely, subject to the following restrictions:  |
| 12 | //  |
| 13 | // 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software.  |
| 14 | // If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is  |
| 15 | // not required.  |
| 16 | //  |
| 17 | // 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original  |
| 18 | // software.  |
| 19 | //  |
| 20 | // 3. This notice may not be removed or altered from any source distribution.  |
| 21 | //  |
| 22 | // To be absolutely clear, the tRegex class found here is an 'altered source' version of the original. The alterations  |
| 23 | // are under the following license:  |
| 24 | //  |
| 25 | // Copyright (c) 2006, 2017 Tristan Grimmer.  |
| 26 | // Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby  |
| 27 | // granted, provided that the above copyright notice and this permission notice appear in all copies.  |
| 28 | //  |
| 29 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL  |
| 30 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,  |
| 31 | // INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN  |
| 32 | // AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR  |
| 33 | // PERFORMANCE OF THIS SOFTWARE.  |
| 34 |   |
| 35 | #include <Foundation/tMemory.h>  |
| 36 | #include "System/tThrow.h"  |
| 37 | #include "System/tRegex.h"  |
| 38 | #include "System/tPrint.h"  |
| 39 | using namespace tStd;  |
| 40 | using namespace tMem;  |
| 41 | //#define REGEX_DEBUG_PRINT  |
| 42 | namespace tSystem  |
| 43 | {  |
| 44 |   |
| 45 |   |
| 46 | enum tOperator  |
| 47 | {  |
| 48 | // Invalid is 255 on purpose.  |
| 49 | tOperator_Invalid = tCharInvalid,  |
| 50 | tOperator_First,  |
| 51 | tOperator_Greedy = tOperator_First,  |
| 52 | tOperator_Or,  |
| 53 | tOperator_Expr,  |
| 54 | tOperator_NoCapExpr,  |
| 55 | tOperator_Dot,  |
| 56 | tOperator_Class,  |
| 57 | tOperator_CClass,  |
| 58 | tOperator_NClass,  |
| 59 | tOperator_Range,  |
| 60 | tOperator_Char,  |
| 61 | tOperator_EOL,  |
| 62 | tOperator_BOL,  |
| 63 | tOperator_WB,  |
| 64 | tOperator_NumOperators = tOperator_WB - tOperator_Invalid  |
| 65 | };  |
| 66 |   |
| 67 |   |
| 68 | #ifdef REGEX_DEBUG_PRINT  |
| 69 | static const char* OperatorNames[tOperator_NumOperators] =  |
| 70 | {  |
| 71 | "Greedy" ,  |
| 72 | "Or" ,  |
| 73 | "Expr" ,  |
| 74 | "NoCapExpr" ,  |
| 75 | "Dot" ,  |
| 76 | "Class" ,  |
| 77 | "CClass" ,  |
| 78 | "NClass" ,  |
| 79 | "Range" ,  |
| 80 | "Char" ,  |
| 81 | "EOL" ,  |
| 82 | "BOL" ,  |
| 83 | "WB"   |
| 84 | };  |
| 85 | #endif  |
| 86 |   |
| 87 |   |
| 88 | const static char iSymbol_AnyChar = '.';  |
| 89 | const static char iSymbol_GreedyOneOrMore = '+';  |
| 90 | const static char iSymbol_GreedyZeroOrMore = '*';  |
| 91 | const static char iSymbol_GreedyZeroOrOne = '?';  |
| 92 | const static char iSymbol_Branch = '|';  |
| 93 | const static char iSymbol_StringEnd = '$';  |
| 94 | const static char iSymbol_StringBegin = '^';  |
| 95 | const static char iSymbol_Escape = '\\';  |
| 96 |   |
| 97 |   |
| 98 | int tRegex::NewNode(int type)  |
| 99 | {  |
| 100 | tRegex::Node n;  |
| 101 | n.Type = type;  |
| 102 | n.Next = n.Right = n.Left = -1;  |
| 103 |   |
| 104 | if (type == tOperator_Expr)  |
| 105 | n.Right = NumSubExpr++;  |
| 106 |   |
| 107 | if (NumNodesAllocated < (NumNodes + 1))  |
| 108 | {  |
| 109 | int origSize = NumNodesAllocated * sizeof(tRegex::Node);  |
| 110 | NumNodesAllocated *= 2;  |
| 111 | tRegex::Node* newNodes = (tRegex::Node*)tMalloc(NumNodesAllocated * sizeof(tRegex::Node));  |
| 112 | if (Nodes)  |
| 113 | {  |
| 114 | tMemcpy(newNodes, Nodes, origSize);  |
| 115 | tFree(Nodes);  |
| 116 | }  |
| 117 | Nodes = newNodes;  |
| 118 | }  |
| 119 |   |
| 120 | Nodes[NumNodes++] = n;  |
| 121 | int newid = NumNodes - 1;  |
| 122 | return newid;  |
| 123 | }  |
| 124 |   |
| 125 |   |
| 126 | void tRegex::Expect(int n)  |
| 127 | {  |
| 128 | if ((*Curr) != n)   |
| 129 | throw tError("Expected bracket." );  |
| 130 |   |
| 131 | Curr++;  |
| 132 | }  |
| 133 |   |
| 134 |   |
| 135 | char tRegex::EscapeChar()  |
| 136 | {  |
| 137 | if (*Curr == iSymbol_Escape)  |
| 138 | {  |
| 139 | Curr++;  |
| 140 | switch (*Curr)  |
| 141 | {  |
| 142 | case 'v':  |
| 143 | Curr++;  |
| 144 | return '\v';  |
| 145 |   |
| 146 | case 'n':  |
| 147 | Curr++;  |
| 148 | return '\n';  |
| 149 |   |
| 150 | case 't':  |
| 151 | Curr++;  |
| 152 | return '\t';  |
| 153 |   |
| 154 | case 'r':  |
| 155 | Curr++;  |
| 156 | return '\r';  |
| 157 |   |
| 158 | case 'f':  |
| 159 | Curr++;  |
| 160 | return '\f';  |
| 161 |   |
| 162 | default:  |
| 163 | return (*Curr++);  |
| 164 | }  |
| 165 | }  |
| 166 | else if (!tIsprint(*Curr))  |
| 167 | throw tError("Letter expected." );  |
| 168 |   |
| 169 | return (*Curr++);  |
| 170 | }  |
| 171 |   |
| 172 |   |
| 173 | int tRegex::CharClass(int classid)  |
| 174 | {  |
| 175 | int n = NewNode(tOperator_CClass);  |
| 176 | Nodes[n].Left = classid;  |
| 177 | return n;  |
| 178 | }  |
| 179 |   |
| 180 |   |
| 181 | int tRegex::CharNode(bool isclass)  |
| 182 | {  |
| 183 | if (*Curr == iSymbol_Escape)  |
| 184 | {  |
| 185 | Curr++;  |
| 186 | switch (*Curr)  |
| 187 | {  |
| 188 | case 'n':  |
| 189 | Curr++;  |
| 190 | return NewNode('\n');  |
| 191 |   |
| 192 | case 't':  |
| 193 | Curr++;  |
| 194 | return NewNode('\t');  |
| 195 |   |
| 196 | case 'r':  |
| 197 | Curr++;  |
| 198 | return NewNode('\r');  |
| 199 |   |
| 200 | case 'f':  |
| 201 | Curr++;  |
| 202 | return NewNode('\f');  |
| 203 |   |
| 204 | case 'v':  |
| 205 | Curr++;  |
| 206 | return NewNode('\v');  |
| 207 |   |
| 208 | case 'a': case 'A': case 'w': case 'W':  |
| 209 | case 's': case 'S': case 'd': case 'D':  |
| 210 | case 'x': case 'X': case 'c': case 'C':  |
| 211 | case 'p': case 'P': case 'l': case 'u':  |
| 212 | {  |
| 213 | char t = *Curr;  |
| 214 | Curr++;  |
| 215 | return CharClass(t);  |
| 216 | }  |
| 217 |   |
| 218 | case 'b':  |
| 219 | case 'B':  |
| 220 | if (!isclass)  |
| 221 | {  |
| 222 | int node = NewNode(tOperator_WB);  |
| 223 | Nodes[node].Left = *Curr;  |
| 224 | Curr++;   |
| 225 | return node;  |
| 226 | }  |
| 227 |   |
| 228 | default:  |
| 229 | {  |
| 230 | char t = *Curr;  |
| 231 | Curr++;  |
| 232 | return NewNode(t);  |
| 233 | }  |
| 234 | }  |
| 235 | }  |
| 236 | else if (!tIsprint(*Curr))  |
| 237 | {  |
| 238 | throw tError("Letter expected." );  |
| 239 | }  |
| 240 |   |
| 241 | char t = *Curr;  |
| 242 | Curr++;   |
| 243 | return NewNode(t);  |
| 244 | }  |
| 245 |   |
| 246 |   |
| 247 | int tRegex::Class()  |
| 248 | {  |
| 249 | int ret = -1;  |
| 250 | if (*Curr == iSymbol_StringBegin)  |
| 251 | {  |
| 252 | ret = NewNode(tOperator_NClass);  |
| 253 | Curr++;  |
| 254 | }  |
| 255 | else  |
| 256 | ret = NewNode(tOperator_Class);  |
| 257 |   |
| 258 | if (*Curr == ']')  |
| 259 | throw tError("Empty class." );  |
| 260 |   |
| 261 | int first = -1;  |
| 262 | int chain = ret;  |
| 263 | while (*Curr != ']' && Curr != EOL)  |
| 264 | {  |
| 265 | if (*Curr == '-' && first != -1)  |
| 266 | {  |
| 267 | int r;  |
| 268 | if (*Curr++ == ']')  |
| 269 | throw tError("Unfinished range." );  |
| 270 |   |
| 271 | r = NewNode(tOperator_Range);  |
| 272 | if (first > *Curr)  |
| 273 | throw tError("Invalid range." );  |
| 274 |   |
| 275 | if (Nodes[first].Type == tOperator_CClass)  |
| 276 | throw tError("Cannot use character classes in ranges." );  |
| 277 |   |
| 278 | Nodes[r].Left = Nodes[first].Type;  |
| 279 | int t = EscapeChar();  |
| 280 | Nodes[r].Right = t;  |
| 281 | Nodes[chain].Next = r;  |
| 282 | chain = r;  |
| 283 | first = -1;  |
| 284 | }  |
| 285 | else  |
| 286 | {  |
| 287 | if (first != -1)  |
| 288 | {  |
| 289 | int c = first;  |
| 290 | Nodes[chain].Next = c;  |
| 291 | chain = c;  |
| 292 | }  |
| 293 | first = CharNode(true);  |
| 294 | }  |
| 295 | }  |
| 296 |   |
| 297 | if (first != -1)  |
| 298 | {  |
| 299 | int c = first;  |
| 300 | Nodes[chain].Next = c;  |
| 301 | chain = c;  |
| 302 | first = -1;  |
| 303 | }  |
| 304 |   |
| 305 | // A hack?  |
| 306 | Nodes[ret].Left = Nodes[ret].Next;  |
| 307 | Nodes[ret].Next = -1;  |
| 308 | return ret;  |
| 309 | }  |
| 310 |   |
| 311 |   |
| 312 | int tRegex::ParseNumber()  |
| 313 | {  |
| 314 | int ret = *Curr - '0';  |
| 315 | int positions = 10;  |
| 316 | Curr++;  |
| 317 |   |
| 318 | while (tIsdigit(*Curr))  |
| 319 | {  |
| 320 | ret = ret*10 + (*Curr++ - '0');  |
| 321 | if (positions == 1000000000)  |
| 322 | throw tError("Overflow in numeric constant." );  |
| 323 |   |
| 324 | positions *= 10;  |
| 325 | };  |
| 326 |   |
| 327 | return ret;  |
| 328 | }  |
| 329 |   |
| 330 |   |
| 331 | int tRegex::Element()  |
| 332 | {  |
| 333 | int ret = -1;  |
| 334 | switch (*Curr)  |
| 335 | {  |
| 336 | case '(':  |
| 337 | {  |
| 338 | int expr;  |
| 339 | Curr++;  |
| 340 |   |
| 341 | if (*Curr =='?')  |
| 342 | {  |
| 343 | Curr++;  |
| 344 | Expect(':');  |
| 345 | expr = NewNode(tOperator_NoCapExpr);  |
| 346 | }  |
| 347 | else  |
| 348 | expr = NewNode(tOperator_Expr);  |
| 349 |   |
| 350 | int newn = ListRec();  |
| 351 | Nodes[expr].Left = newn;  |
| 352 | ret = expr;  |
| 353 | Expect(')');  |
| 354 | break;  |
| 355 | }  |
| 356 |   |
| 357 | case '[':  |
| 358 | Curr++;  |
| 359 | ret = Class();  |
| 360 | Expect(']');  |
| 361 | break;  |
| 362 |   |
| 363 | case iSymbol_StringEnd:  |
| 364 | Curr++;  |
| 365 | ret = NewNode(tOperator_EOL);  |
| 366 | break;  |
| 367 |   |
| 368 | case iSymbol_AnyChar:  |
| 369 | Curr++;  |
| 370 | ret = NewNode(tOperator_Dot);  |
| 371 | break;  |
| 372 |   |
| 373 | default:  |
| 374 | ret = CharNode(false);  |
| 375 | break;  |
| 376 | }  |
| 377 |   |
| 378 | int op;  |
| 379 | bool isgreedy = false;  |
| 380 | uint16 p0 = 0, p1 = 0;  |
| 381 | switch (*Curr)  |
| 382 | {  |
| 383 | case iSymbol_GreedyZeroOrMore:  |
| 384 | p0 = 0;  |
| 385 | p1 = 0xFFFF;  |
| 386 | Curr++;  |
| 387 | isgreedy = true;  |
| 388 | break;  |
| 389 |   |
| 390 | case iSymbol_GreedyOneOrMore:  |
| 391 | p0 = 1;  |
| 392 | p1 = 0xFFFF;  |
| 393 | Curr++;  |
| 394 | isgreedy = true;  |
| 395 | break;  |
| 396 |   |
| 397 | case iSymbol_GreedyZeroOrOne:  |
| 398 | p0 = 0;  |
| 399 | p1 = 1;  |
| 400 | Curr++;  |
| 401 | isgreedy = true;  |
| 402 | break;  |
| 403 |   |
| 404 | case '{':  |
| 405 | Curr++;  |
| 406 | if (!tIsdigit(*Curr))  |
| 407 | throw tError("Number expected." );  |
| 408 |   |
| 409 | p0 = uint16(ParseNumber());  |
| 410 | switch (*Curr)  |
| 411 | {  |
| 412 | case '}':  |
| 413 | p1 = p0;  |
| 414 | Curr++;  |
| 415 | break;  |
| 416 |   |
| 417 | case ',':  |
| 418 | Curr++;  |
| 419 | p1 = 0xFFFF;  |
| 420 | if (tIsdigit(*Curr))  |
| 421 | p1 = uint16(ParseNumber());  |
| 422 | Expect('}');  |
| 423 | break;  |
| 424 |   |
| 425 | default:  |
| 426 | throw tError("A , or } was expected." );  |
| 427 | }  |
| 428 |   |
| 429 | isgreedy = true;   |
| 430 | break;  |
| 431 | }  |
| 432 |   |
| 433 | if (isgreedy)  |
| 434 | {  |
| 435 | int nnode = NewNode(tOperator_Greedy);  |
| 436 | op = tOperator_Greedy;  |
| 437 | Nodes[nnode].Left = ret;  |
| 438 | Nodes[nnode].Right = ((p0)<<16) | p1;  |
| 439 | ret = nnode;  |
| 440 | }  |
| 441 |   |
| 442 | if  |
| 443 | (  |
| 444 | (*Curr != iSymbol_Branch) && (*Curr != ')') &&  |
| 445 | (*Curr != iSymbol_GreedyZeroOrMore) && (*Curr != iSymbol_GreedyOneOrMore) &&  |
| 446 | (*Curr != '\0')  |
| 447 | )  |
| 448 | {  |
| 449 | int nnode = Element();  |
| 450 | Nodes[ret].Next = nnode;  |
| 451 | }  |
| 452 |   |
| 453 | return ret;  |
| 454 | }  |
| 455 |   |
| 456 |   |
| 457 | int tRegex::ListRec()  |
| 458 | {  |
| 459 | int ret = -1, e;  |
| 460 | if (*Curr == iSymbol_StringBegin)  |
| 461 | {  |
| 462 | Curr++;  |
| 463 | ret = NewNode(tOperator_BOL);  |
| 464 | }  |
| 465 |   |
| 466 | e = Element();  |
| 467 | if (ret != -1)  |
| 468 | Nodes[ret].Next = e;  |
| 469 | else  |
| 470 | ret = e;  |
| 471 |   |
| 472 | if (*Curr == iSymbol_Branch)  |
| 473 | {  |
| 474 | int temp, tright;  |
| 475 | Curr++;  |
| 476 | temp = NewNode(tOperator_Or);  |
| 477 | Nodes[temp].Left = ret;  |
| 478 | tright = ListRec();  |
| 479 | Nodes[temp].Right = tright;  |
| 480 | ret = temp;  |
| 481 | }  |
| 482 |   |
| 483 | return ret;  |
| 484 | }  |
| 485 |   |
| 486 |   |
| 487 | bool tRegex::MatchCClass(int cclass, char c)  |
| 488 | {  |
| 489 | switch (cclass)  |
| 490 | {  |
| 491 | case 'a':  |
| 492 | return tIsalpha(c) ? true : false;  |
| 493 |   |
| 494 | case 'A':  |
| 495 | return !tIsalpha(c) ? true : false;  |
| 496 |   |
| 497 | case 'w':  |
| 498 | return (tIsalnum(c) || c == '_') ? true : false;  |
| 499 |   |
| 500 | case 'W':  |
| 501 | return (!tIsalnum(c) && c != '_') ? true : false;  |
| 502 |   |
| 503 | case 's':  |
| 504 | return tIsspace(c) ? true : false;  |
| 505 |   |
| 506 | case 'S':  |
| 507 | return !tIsspace(c) ? true : false;  |
| 508 |   |
| 509 | case 'd':  |
| 510 | return tIsdigit(c) ? true : false;  |
| 511 |   |
| 512 | case 'D':  |
| 513 | return !tIsdigit(c) ? true : false;  |
| 514 |   |
| 515 | case 'x':  |
| 516 | return tIsxdigit(c) ? true : false;  |
| 517 |   |
| 518 | case 'X':  |
| 519 | return !tIsxdigit(c) ? true : false;  |
| 520 |   |
| 521 | case 'c':  |
| 522 | return tIscntrl(c) ? true : false;  |
| 523 |   |
| 524 | case 'C':  |
| 525 | return !tIscntrl(c) ? true : false;  |
| 526 |   |
| 527 | case 'p':  |
| 528 | return tIspunct(c) ? true : false;  |
| 529 |   |
| 530 | case 'P':  |
| 531 | return !tIspunct(c) ? true : false;  |
| 532 |   |
| 533 | case 'l':  |
| 534 | return tIslower(c) ? true : false;  |
| 535 |   |
| 536 | case 'u':  |
| 537 | return tIsupper(c) ? true : false;  |
| 538 | }  |
| 539 |   |
| 540 | return false;  |
| 541 | }  |
| 542 |   |
| 543 |   |
| 544 | bool tRegex::MatchClass(const tRegex::Node* node, char c) const  |
| 545 | {  |
| 546 | do  |
| 547 | {  |
| 548 | switch (node->Type)  |
| 549 | {  |
| 550 | case tOperator_Range:  |
| 551 | if (c >= node->Left && c <= node->Right)  |
| 552 | return true;  |
| 553 | break;  |
| 554 |   |
| 555 | case tOperator_CClass:  |
| 556 | if (MatchCClass(node->Left, c))  |
| 557 | return true;  |
| 558 | break;  |
| 559 |   |
| 560 | default:  |
| 561 | if (c == node->Type)  |
| 562 | return true;  |
| 563 | }  |
| 564 | }  |
| 565 | while ((node->Next != -1) && (node = &Nodes[node->Next]));  |
| 566 |   |
| 567 | return false;  |
| 568 | }  |
| 569 |   |
| 570 |   |
| 571 | const char* tRegex::MatchNode(const tRegex::Node* node, const char* str, const tRegex::Node* next) const  |
| 572 | {  |
| 573 | int type = node->Type;  |
| 574 | switch (type)  |
| 575 | {  |
| 576 | case tOperator_Greedy:  |
| 577 | {  |
| 578 | const tRegex::Node* greedystop = 0;  |
| 579 | int p0 = (node->Right >> 16) & 0x0000FFFF, p1 = node->Right & 0x0000FFFF, nmaches = 0;  |
| 580 | const char* s = str, *good = str;  |
| 581 |   |
| 582 | if (node->Next != -1)  |
| 583 | greedystop = &Nodes[node->Next];  |
| 584 | else  |
| 585 | greedystop = next;  |
| 586 |   |
| 587 | while ((nmaches == 0xFFFF || nmaches < p1))  |
| 588 | {  |
| 589 | if (!(s = MatchNode(&Nodes[node->Left], s, greedystop)))  |
| 590 | break;  |
| 591 |   |
| 592 | nmaches++;  |
| 593 | good = s;  |
| 594 | if (greedystop)  |
| 595 | {  |
| 596 | // Checks that 0 matches satisfy the expression and if so skips.  |
| 597 | // If not would always stop (for instance if is a '?').  |
| 598 | if  |
| 599 | (  |
| 600 | greedystop->Type != tOperator_Greedy ||  |
| 601 | (greedystop->Type == tOperator_Greedy && ((greedystop->Right >> 16)&0x0000FFFF) != 0)  |
| 602 | )  |
| 603 | {  |
| 604 | tRegex::Node* gnext = 0;  |
| 605 | if (greedystop->Next != -1)  |
| 606 | gnext = &Nodes[greedystop->Next];  |
| 607 | else if (next && next->Next != -1)  |
| 608 | gnext = &Nodes[next->Next];  |
| 609 |   |
| 610 | const char* stop = MatchNode(greedystop, s, gnext);  |
| 611 | if (stop)  |
| 612 | {  |
| 613 | // If satisfied stop it.  |
| 614 | if (p0 == p1 && p0 == nmaches)  |
| 615 | break;  |
| 616 | else if (nmaches >= p0 && p1 == 0xFFFF)  |
| 617 | break;  |
| 618 | else if (nmaches >= p0 && nmaches <= p1)  |
| 619 | break;  |
| 620 | }  |
| 621 | }  |
| 622 | }  |
| 623 |   |
| 624 | if (s >= EOL)  |
| 625 | break;  |
| 626 | }  |
| 627 |   |
| 628 | if (p0 == p1 && p0 == nmaches)  |
| 629 | return good;  |
| 630 |   |
| 631 | else if (nmaches >= p0 && p1 == 0xFFFF)  |
| 632 | return good;  |
| 633 |   |
| 634 | else if (nmaches >= p0 && nmaches <= p1)  |
| 635 | return good;  |
| 636 |   |
| 637 | return nullptr;  |
| 638 | }  |
| 639 |   |
| 640 | case tOperator_Or:  |
| 641 | {  |
| 642 | const char* asd = str;  |
| 643 | tRegex::Node* temp = &Nodes[node->Left];  |
| 644 | while ((asd = MatchNode(temp, asd, 0)))  |
| 645 | {  |
| 646 | if (temp->Next != -1)  |
| 647 | temp = &Nodes[temp->Next];  |
| 648 | else  |
| 649 | return asd;  |
| 650 | }  |
| 651 |   |
| 652 | asd = str;  |
| 653 | temp = &Nodes[node->Right];  |
| 654 | while ((asd = MatchNode(temp, asd, 0)))  |
| 655 | {  |
| 656 | if (temp->Next != -1)  |
| 657 | temp = &Nodes[temp->Next];  |
| 658 | else  |
| 659 | return asd;  |
| 660 | }  |
| 661 | return nullptr;  |
| 662 | }  |
| 663 |   |
| 664 | case tOperator_Expr:  |
| 665 | case tOperator_NoCapExpr:  |
| 666 | {  |
| 667 | tRegex::Node* n = &Nodes[node->Left];  |
| 668 | const char* cur = str;  |
| 669 | int capture = -1;  |
| 670 | if (node->Type != tOperator_NoCapExpr && node->Right == CurrSubExpr)  |
| 671 | {  |
| 672 | capture = CurrSubExpr;  |
| 673 | Matches[capture].Begin = cur;  |
| 674 | CurrSubExpr++;  |
| 675 | }  |
| 676 |   |
| 677 | do  |
| 678 | {  |
| 679 | const tRegex::Node* subnext = 0;  |
| 680 | if (n->Next != -1)  |
| 681 | subnext = &Nodes[n->Next];  |
| 682 | else  |
| 683 | subnext = next;  |
| 684 |   |
| 685 | if (!(cur = MatchNode(n, cur, subnext)))  |
| 686 | {  |
| 687 | if (capture != -1)  |
| 688 | {  |
| 689 | Matches[capture].Begin = 0;  |
| 690 | Matches[capture].Length = 0;  |
| 691 | }  |
| 692 | return nullptr;  |
| 693 | }  |
| 694 | }  |
| 695 | while ((n->Next != -1) && (n = &Nodes[n->Next]));  |
| 696 |   |
| 697 | if (capture != -1)  |
| 698 | Matches[capture].Length = int(cur - Matches[capture].Begin);  |
| 699 |   |
| 700 | return cur;  |
| 701 | }  |
| 702 |   |
| 703 | case tOperator_WB:  |
| 704 | if  |
| 705 | (  |
| 706 | (str == BOL && !tIsspace(*str)) || (str == EOL && !tIsspace(*(str-1))) ||  |
| 707 | (!tIsspace(*str) && tIsspace(*(str+1))) || (tIsspace(*str) && !tIsspace(*(str+1)))  |
| 708 | )  |
| 709 | {  |
| 710 | return (node->Left == 'b') ? str : 0;  |
| 711 | }  |
| 712 | return (node->Left == 'b') ? 0 : str;  |
| 713 |   |
| 714 | case tOperator_BOL:  |
| 715 | if (str == BOL)  |
| 716 | return str;  |
| 717 | return nullptr;  |
| 718 |   |
| 719 | case tOperator_EOL:  |
| 720 | if (str == EOL)  |
| 721 | return str;  |
| 722 | return nullptr;  |
| 723 |   |
| 724 | case tOperator_Dot:  |
| 725 | str++;  |
| 726 | return str;  |
| 727 |   |
| 728 | case tOperator_NClass:  |
| 729 | case tOperator_Class:  |
| 730 | if  |
| 731 | (  |
| 732 | MatchClass(&Nodes[node->Left], *str) ?  |
| 733 | (type == tOperator_Class ? true : false) :  |
| 734 | (type == tOperator_NClass ? true : false)  |
| 735 | )  |
| 736 | {  |
| 737 | str++;  |
| 738 | return str;  |
| 739 | }  |
| 740 | return nullptr;  |
| 741 |   |
| 742 | case tOperator_CClass:  |
| 743 | if (MatchCClass(node->Left, *str))  |
| 744 | {  |
| 745 | str++;  |
| 746 | return str;  |
| 747 | }  |
| 748 | return nullptr;  |
| 749 |   |
| 750 | default:  |
| 751 | if (*str != node->Type)  |
| 752 | return nullptr;  |
| 753 | str++;  |
| 754 | return str;  |
| 755 | }  |
| 756 |   |
| 757 | return nullptr;  |
| 758 | }  |
| 759 |   |
| 760 |   |
| 761 | void tRegex::CompileInternal()  |
| 762 | {  |
| 763 | tAssert(Pattern && !EOL && !BOL && !NumNodes && !Matches && !NumSubExpr);  |
| 764 | Curr = Pattern;  |
| 765 | NumNodesAllocated = tStrlen(Pattern) * sizeof(char);  |
| 766 | Nodes = (tRegex::Node*)tMalloc(NumNodesAllocated * sizeof(tRegex::Node));  |
| 767 |   |
| 768 | First = NewNode(tOperator_Expr);  |
| 769 | int res = ListRec();  |
| 770 | Nodes[First].Left = res;  |
| 771 | if (*Curr != '\0')  |
| 772 | throw tError("Unexpected character." );  |
| 773 |   |
| 774 | #ifdef REGEX_DEBUG_PRINT  |
| 775 | int nsize = NumNodes;  |
| 776 | tRegex::Node* t = &Nodes[0];  |
| 777 | tPrintf("\n" );  |
| 778 | for (int i = 0;i < nsize; i++)  |
| 779 | {  |
| 780 | if (Nodes[i].Type >= tOperator_First)  |
| 781 | tPrintf("[%02d] %10s " , i, OperatorNames[Nodes[i].Type-tOperator_First]);  |
| 782 | else  |
| 783 | tPrintf("[%02d] %10c " , i, Nodes[i].Type);  |
| 784 | tPrintf("left %02d right %02d next %02d\n" , Nodes[i].Left, Nodes[i].Right, Nodes[i].Next);  |
| 785 | }  |
| 786 | tPrintf("\n" );  |
| 787 | #endif  |
| 788 |   |
| 789 | Matches = (MatchInternal*)tMalloc(NumSubExpr * sizeof(MatchInternal));  |
| 790 | tMemset(Matches, 0, NumSubExpr * sizeof(MatchInternal));  |
| 791 | }  |
| 792 |   |
| 793 |   |
| 794 | void tRegex::Clear()  |
| 795 | {  |
| 796 | CurrSubExpr = 0;  |
| 797 | if (Matches)  |
| 798 | tFree(Matches);  |
| 799 | Matches = 0;  |
| 800 |   |
| 801 | NumSubExpr = 0;  |
| 802 | NumNodes = 0;  |
| 803 | NumNodesAllocated = 0;  |
| 804 |   |
| 805 | if (Nodes)  |
| 806 | tFree(Nodes);  |
| 807 | Nodes = 0;  |
| 808 |   |
| 809 | First = -1;  |
| 810 | Curr = 0;  |
| 811 | BOL = 0;  |
| 812 | EOL = 0;  |
| 813 |   |
| 814 | if (Pattern)  |
| 815 | tFree(Pattern);  |
| 816 | Pattern = 0;  |
| 817 | }  |
| 818 |   |
| 819 |   |
| 820 | void tRegex::Compile(const tString& pattern)  |
| 821 | {  |
| 822 | Clear();  |
| 823 | if (pattern.Length() == 0)  |
| 824 | return;  |
| 825 |   |
| 826 | Pattern = (char*)tMalloc((pattern.Length() + 1) * sizeof(char));  |
| 827 | tStrcpy(Pattern, pattern.ConstText());  |
| 828 | CompileInternal();  |
| 829 | }  |
| 830 |   |
| 831 |   |
| 832 | void tRegex::Compile(const char* pattern)  |
| 833 | {  |
| 834 | Clear();  |
| 835 | if (!pattern)  |
| 836 | return;  |
| 837 |   |
| 838 | int len = tStrlen(pattern);  |
| 839 | if (!len)  |
| 840 | return;  |
| 841 |   |
| 842 | Pattern = (char*)tMalloc((len + 1) * sizeof(char));  |
| 843 | tStrcpy(Pattern, pattern);  |
| 844 | CompileInternal();  |
| 845 | }  |
| 846 |   |
| 847 |   |
| 848 | bool tRegex::IsMatch(const char* text) const  |
| 849 | {  |
| 850 | const char* res = 0;  |
| 851 | BOL = text;  |
| 852 | EOL = text + tStrlen(text);  |
| 853 | CurrSubExpr = 0;  |
| 854 | res = MatchNode(Nodes, text, 0);  |
| 855 | if (res == 0 || res != EOL)  |
| 856 | return false;  |
| 857 |   |
| 858 | return true;  |
| 859 | }  |
| 860 |   |
| 861 |   |
| 862 | void tRegex::Search(const char* textBegin, const char* textEnd, tList<Match>& matches) const  |
| 863 | {  |
| 864 | const char* cur = 0;  |
| 865 | int node = First;  |
| 866 | const char* origBegin = textBegin;  |
| 867 | if (textBegin >= textEnd)  |
| 868 | return;  |
| 869 |   |
| 870 | BOL = textBegin;  |
| 871 | EOL = textEnd;  |
| 872 | do  |
| 873 | {  |
| 874 | cur = textBegin;  |
| 875 | while (node != -1)  |
| 876 | {  |
| 877 | CurrSubExpr = 0;  |
| 878 | cur = MatchNode(&Nodes[node], cur, 0);  |
| 879 | if (!cur)  |
| 880 | break;  |
| 881 |   |
| 882 | node = Nodes[node].Next;  |
| 883 | }  |
| 884 |   |
| 885 | textBegin++;  |
| 886 | }  |
| 887 | while (cur == 0 && textBegin != textEnd);  |
| 888 |   |
| 889 | if (cur == 0)  |
| 890 | return;  |
| 891 |   |
| 892 | --textBegin;  |
| 893 |   |
| 894 | // Populate the matches list.  |
| 895 | tAssert(Matches);  |
| 896 | for (int m = 0; m < NumSubExpr; m++)  |
| 897 | {  |
| 898 | MatchInternal& mi = Matches[m];  |
| 899 | matches.Append(new Match(int(mi.Begin-origBegin), mi.Length));  |
| 900 | }  |
| 901 | }  |
| 902 |   |
| 903 |   |
| 904 | }  |
| 905 | |