1 | // tRegex.cpp  |
2 | //  |
3 | // Simple regular expression class. The code is based on TRex but has been significantly modified. The original license  |
4 | // follows:  |
5 | //  |
6 | // Copyright (c) 2003-2006 Alberto Demichelis  |
7 | // This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held  |
8 | // liable for any damages arising from the use of this software.  |
9 | //  |
10 | // Permission is granted to anyone to use this software for any purpose, including commercial applications, and to  |
11 | // alter it and redistribute it freely, subject to the following restrictions:  |
12 | //  |
13 | // 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software.  |
14 | // If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is  |
15 | // not required.  |
16 | //  |
17 | // 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original  |
18 | // software.  |
19 | //  |
20 | // 3. This notice may not be removed or altered from any source distribution.  |
21 | //  |
22 | // To be absolutely clear, the tRegex class found here is an 'altered source' version of the original. The alterations  |
23 | // are under the following license:  |
24 | //  |
25 | // Copyright (c) 2006, 2017 Tristan Grimmer.  |
26 | // Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby  |
27 | // granted, provided that the above copyright notice and this permission notice appear in all copies.  |
28 | //  |
29 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL  |
30 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,  |
31 | // INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN  |
32 | // AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR  |
33 | // PERFORMANCE OF THIS SOFTWARE.  |
34 |   |
35 | #include <Foundation/tMemory.h>  |
36 | #include "System/tThrow.h"  |
37 | #include "System/tRegex.h"  |
38 | #include "System/tPrint.h"  |
39 | using namespace tStd;  |
40 | using namespace tMem;  |
41 | //#define REGEX_DEBUG_PRINT  |
42 | namespace tSystem  |
43 | {  |
44 |   |
45 |   |
46 | enum tOperator  |
47 | {  |
48 | // Invalid is 255 on purpose.  |
49 | tOperator_Invalid = tCharInvalid,  |
50 | tOperator_First,  |
51 | tOperator_Greedy = tOperator_First,  |
52 | tOperator_Or,  |
53 | tOperator_Expr,  |
54 | tOperator_NoCapExpr,  |
55 | tOperator_Dot,  |
56 | tOperator_Class,  |
57 | tOperator_CClass,  |
58 | tOperator_NClass,  |
59 | tOperator_Range,  |
60 | tOperator_Char,  |
61 | tOperator_EOL,  |
62 | tOperator_BOL,  |
63 | tOperator_WB,  |
64 | tOperator_NumOperators = tOperator_WB - tOperator_Invalid  |
65 | };  |
66 |   |
67 |   |
68 | #ifdef REGEX_DEBUG_PRINT  |
69 | static const char* OperatorNames[tOperator_NumOperators] =  |
70 | {  |
71 | "Greedy" ,  |
72 | "Or" ,  |
73 | "Expr" ,  |
74 | "NoCapExpr" ,  |
75 | "Dot" ,  |
76 | "Class" ,  |
77 | "CClass" ,  |
78 | "NClass" ,  |
79 | "Range" ,  |
80 | "Char" ,  |
81 | "EOL" ,  |
82 | "BOL" ,  |
83 | "WB"   |
84 | };  |
85 | #endif  |
86 |   |
87 |   |
88 | const static char iSymbol_AnyChar = '.';  |
89 | const static char iSymbol_GreedyOneOrMore = '+';  |
90 | const static char iSymbol_GreedyZeroOrMore = '*';  |
91 | const static char iSymbol_GreedyZeroOrOne = '?';  |
92 | const static char iSymbol_Branch = '|';  |
93 | const static char iSymbol_StringEnd = '$';  |
94 | const static char iSymbol_StringBegin = '^';  |
95 | const static char iSymbol_Escape = '\\';  |
96 |   |
97 |   |
98 | int tRegex::NewNode(int type)  |
99 | {  |
100 | tRegex::Node n;  |
101 | n.Type = type;  |
102 | n.Next = n.Right = n.Left = -1;  |
103 |   |
104 | if (type == tOperator_Expr)  |
105 | n.Right = NumSubExpr++;  |
106 |   |
107 | if (NumNodesAllocated < (NumNodes + 1))  |
108 | {  |
109 | int origSize = NumNodesAllocated * sizeof(tRegex::Node);  |
110 | NumNodesAllocated *= 2;  |
111 | tRegex::Node* newNodes = (tRegex::Node*)tMalloc(NumNodesAllocated * sizeof(tRegex::Node));  |
112 | if (Nodes)  |
113 | {  |
114 | tMemcpy(newNodes, Nodes, origSize);  |
115 | tFree(Nodes);  |
116 | }  |
117 | Nodes = newNodes;  |
118 | }  |
119 |   |
120 | Nodes[NumNodes++] = n;  |
121 | int newid = NumNodes - 1;  |
122 | return newid;  |
123 | }  |
124 |   |
125 |   |
126 | void tRegex::Expect(int n)  |
127 | {  |
128 | if ((*Curr) != n)   |
129 | throw tError("Expected bracket." );  |
130 |   |
131 | Curr++;  |
132 | }  |
133 |   |
134 |   |
135 | char tRegex::EscapeChar()  |
136 | {  |
137 | if (*Curr == iSymbol_Escape)  |
138 | {  |
139 | Curr++;  |
140 | switch (*Curr)  |
141 | {  |
142 | case 'v':  |
143 | Curr++;  |
144 | return '\v';  |
145 |   |
146 | case 'n':  |
147 | Curr++;  |
148 | return '\n';  |
149 |   |
150 | case 't':  |
151 | Curr++;  |
152 | return '\t';  |
153 |   |
154 | case 'r':  |
155 | Curr++;  |
156 | return '\r';  |
157 |   |
158 | case 'f':  |
159 | Curr++;  |
160 | return '\f';  |
161 |   |
162 | default:  |
163 | return (*Curr++);  |
164 | }  |
165 | }  |
166 | else if (!tIsprint(*Curr))  |
167 | throw tError("Letter expected." );  |
168 |   |
169 | return (*Curr++);  |
170 | }  |
171 |   |
172 |   |
173 | int tRegex::CharClass(int classid)  |
174 | {  |
175 | int n = NewNode(tOperator_CClass);  |
176 | Nodes[n].Left = classid;  |
177 | return n;  |
178 | }  |
179 |   |
180 |   |
181 | int tRegex::CharNode(bool isclass)  |
182 | {  |
183 | if (*Curr == iSymbol_Escape)  |
184 | {  |
185 | Curr++;  |
186 | switch (*Curr)  |
187 | {  |
188 | case 'n':  |
189 | Curr++;  |
190 | return NewNode('\n');  |
191 |   |
192 | case 't':  |
193 | Curr++;  |
194 | return NewNode('\t');  |
195 |   |
196 | case 'r':  |
197 | Curr++;  |
198 | return NewNode('\r');  |
199 |   |
200 | case 'f':  |
201 | Curr++;  |
202 | return NewNode('\f');  |
203 |   |
204 | case 'v':  |
205 | Curr++;  |
206 | return NewNode('\v');  |
207 |   |
208 | case 'a': case 'A': case 'w': case 'W':  |
209 | case 's': case 'S': case 'd': case 'D':  |
210 | case 'x': case 'X': case 'c': case 'C':  |
211 | case 'p': case 'P': case 'l': case 'u':  |
212 | {  |
213 | char t = *Curr;  |
214 | Curr++;  |
215 | return CharClass(t);  |
216 | }  |
217 |   |
218 | case 'b':  |
219 | case 'B':  |
220 | if (!isclass)  |
221 | {  |
222 | int node = NewNode(tOperator_WB);  |
223 | Nodes[node].Left = *Curr;  |
224 | Curr++;   |
225 | return node;  |
226 | }  |
227 |   |
228 | default:  |
229 | {  |
230 | char t = *Curr;  |
231 | Curr++;  |
232 | return NewNode(t);  |
233 | }  |
234 | }  |
235 | }  |
236 | else if (!tIsprint(*Curr))  |
237 | {  |
238 | throw tError("Letter expected." );  |
239 | }  |
240 |   |
241 | char t = *Curr;  |
242 | Curr++;   |
243 | return NewNode(t);  |
244 | }  |
245 |   |
246 |   |
247 | int tRegex::Class()  |
248 | {  |
249 | int ret = -1;  |
250 | if (*Curr == iSymbol_StringBegin)  |
251 | {  |
252 | ret = NewNode(tOperator_NClass);  |
253 | Curr++;  |
254 | }  |
255 | else  |
256 | ret = NewNode(tOperator_Class);  |
257 |   |
258 | if (*Curr == ']')  |
259 | throw tError("Empty class." );  |
260 |   |
261 | int first = -1;  |
262 | int chain = ret;  |
263 | while (*Curr != ']' && Curr != EOL)  |
264 | {  |
265 | if (*Curr == '-' && first != -1)  |
266 | {  |
267 | int r;  |
268 | if (*Curr++ == ']')  |
269 | throw tError("Unfinished range." );  |
270 |   |
271 | r = NewNode(tOperator_Range);  |
272 | if (first > *Curr)  |
273 | throw tError("Invalid range." );  |
274 |   |
275 | if (Nodes[first].Type == tOperator_CClass)  |
276 | throw tError("Cannot use character classes in ranges." );  |
277 |   |
278 | Nodes[r].Left = Nodes[first].Type;  |
279 | int t = EscapeChar();  |
280 | Nodes[r].Right = t;  |
281 | Nodes[chain].Next = r;  |
282 | chain = r;  |
283 | first = -1;  |
284 | }  |
285 | else  |
286 | {  |
287 | if (first != -1)  |
288 | {  |
289 | int c = first;  |
290 | Nodes[chain].Next = c;  |
291 | chain = c;  |
292 | }  |
293 | first = CharNode(true);  |
294 | }  |
295 | }  |
296 |   |
297 | if (first != -1)  |
298 | {  |
299 | int c = first;  |
300 | Nodes[chain].Next = c;  |
301 | chain = c;  |
302 | first = -1;  |
303 | }  |
304 |   |
305 | // A hack?  |
306 | Nodes[ret].Left = Nodes[ret].Next;  |
307 | Nodes[ret].Next = -1;  |
308 | return ret;  |
309 | }  |
310 |   |
311 |   |
312 | int tRegex::ParseNumber()  |
313 | {  |
314 | int ret = *Curr - '0';  |
315 | int positions = 10;  |
316 | Curr++;  |
317 |   |
318 | while (tIsdigit(*Curr))  |
319 | {  |
320 | ret = ret*10 + (*Curr++ - '0');  |
321 | if (positions == 1000000000)  |
322 | throw tError("Overflow in numeric constant." );  |
323 |   |
324 | positions *= 10;  |
325 | };  |
326 |   |
327 | return ret;  |
328 | }  |
329 |   |
330 |   |
331 | int tRegex::Element()  |
332 | {  |
333 | int ret = -1;  |
334 | switch (*Curr)  |
335 | {  |
336 | case '(':  |
337 | {  |
338 | int expr;  |
339 | Curr++;  |
340 |   |
341 | if (*Curr =='?')  |
342 | {  |
343 | Curr++;  |
344 | Expect(':');  |
345 | expr = NewNode(tOperator_NoCapExpr);  |
346 | }  |
347 | else  |
348 | expr = NewNode(tOperator_Expr);  |
349 |   |
350 | int newn = ListRec();  |
351 | Nodes[expr].Left = newn;  |
352 | ret = expr;  |
353 | Expect(')');  |
354 | break;  |
355 | }  |
356 |   |
357 | case '[':  |
358 | Curr++;  |
359 | ret = Class();  |
360 | Expect(']');  |
361 | break;  |
362 |   |
363 | case iSymbol_StringEnd:  |
364 | Curr++;  |
365 | ret = NewNode(tOperator_EOL);  |
366 | break;  |
367 |   |
368 | case iSymbol_AnyChar:  |
369 | Curr++;  |
370 | ret = NewNode(tOperator_Dot);  |
371 | break;  |
372 |   |
373 | default:  |
374 | ret = CharNode(false);  |
375 | break;  |
376 | }  |
377 |   |
378 | int op;  |
379 | bool isgreedy = false;  |
380 | uint16 p0 = 0, p1 = 0;  |
381 | switch (*Curr)  |
382 | {  |
383 | case iSymbol_GreedyZeroOrMore:  |
384 | p0 = 0;  |
385 | p1 = 0xFFFF;  |
386 | Curr++;  |
387 | isgreedy = true;  |
388 | break;  |
389 |   |
390 | case iSymbol_GreedyOneOrMore:  |
391 | p0 = 1;  |
392 | p1 = 0xFFFF;  |
393 | Curr++;  |
394 | isgreedy = true;  |
395 | break;  |
396 |   |
397 | case iSymbol_GreedyZeroOrOne:  |
398 | p0 = 0;  |
399 | p1 = 1;  |
400 | Curr++;  |
401 | isgreedy = true;  |
402 | break;  |
403 |   |
404 | case '{':  |
405 | Curr++;  |
406 | if (!tIsdigit(*Curr))  |
407 | throw tError("Number expected." );  |
408 |   |
409 | p0 = uint16(ParseNumber());  |
410 | switch (*Curr)  |
411 | {  |
412 | case '}':  |
413 | p1 = p0;  |
414 | Curr++;  |
415 | break;  |
416 |   |
417 | case ',':  |
418 | Curr++;  |
419 | p1 = 0xFFFF;  |
420 | if (tIsdigit(*Curr))  |
421 | p1 = uint16(ParseNumber());  |
422 | Expect('}');  |
423 | break;  |
424 |   |
425 | default:  |
426 | throw tError("A , or } was expected." );  |
427 | }  |
428 |   |
429 | isgreedy = true;   |
430 | break;  |
431 | }  |
432 |   |
433 | if (isgreedy)  |
434 | {  |
435 | int nnode = NewNode(tOperator_Greedy);  |
436 | op = tOperator_Greedy;  |
437 | Nodes[nnode].Left = ret;  |
438 | Nodes[nnode].Right = ((p0)<<16) | p1;  |
439 | ret = nnode;  |
440 | }  |
441 |   |
442 | if  |
443 | (  |
444 | (*Curr != iSymbol_Branch) && (*Curr != ')') &&  |
445 | (*Curr != iSymbol_GreedyZeroOrMore) && (*Curr != iSymbol_GreedyOneOrMore) &&  |
446 | (*Curr != '\0')  |
447 | )  |
448 | {  |
449 | int nnode = Element();  |
450 | Nodes[ret].Next = nnode;  |
451 | }  |
452 |   |
453 | return ret;  |
454 | }  |
455 |   |
456 |   |
457 | int tRegex::ListRec()  |
458 | {  |
459 | int ret = -1, e;  |
460 | if (*Curr == iSymbol_StringBegin)  |
461 | {  |
462 | Curr++;  |
463 | ret = NewNode(tOperator_BOL);  |
464 | }  |
465 |   |
466 | e = Element();  |
467 | if (ret != -1)  |
468 | Nodes[ret].Next = e;  |
469 | else  |
470 | ret = e;  |
471 |   |
472 | if (*Curr == iSymbol_Branch)  |
473 | {  |
474 | int temp, tright;  |
475 | Curr++;  |
476 | temp = NewNode(tOperator_Or);  |
477 | Nodes[temp].Left = ret;  |
478 | tright = ListRec();  |
479 | Nodes[temp].Right = tright;  |
480 | ret = temp;  |
481 | }  |
482 |   |
483 | return ret;  |
484 | }  |
485 |   |
486 |   |
487 | bool tRegex::MatchCClass(int cclass, char c)  |
488 | {  |
489 | switch (cclass)  |
490 | {  |
491 | case 'a':  |
492 | return tIsalpha(c) ? true : false;  |
493 |   |
494 | case 'A':  |
495 | return !tIsalpha(c) ? true : false;  |
496 |   |
497 | case 'w':  |
498 | return (tIsalnum(c) || c == '_') ? true : false;  |
499 |   |
500 | case 'W':  |
501 | return (!tIsalnum(c) && c != '_') ? true : false;  |
502 |   |
503 | case 's':  |
504 | return tIsspace(c) ? true : false;  |
505 |   |
506 | case 'S':  |
507 | return !tIsspace(c) ? true : false;  |
508 |   |
509 | case 'd':  |
510 | return tIsdigit(c) ? true : false;  |
511 |   |
512 | case 'D':  |
513 | return !tIsdigit(c) ? true : false;  |
514 |   |
515 | case 'x':  |
516 | return tIsxdigit(c) ? true : false;  |
517 |   |
518 | case 'X':  |
519 | return !tIsxdigit(c) ? true : false;  |
520 |   |
521 | case 'c':  |
522 | return tIscntrl(c) ? true : false;  |
523 |   |
524 | case 'C':  |
525 | return !tIscntrl(c) ? true : false;  |
526 |   |
527 | case 'p':  |
528 | return tIspunct(c) ? true : false;  |
529 |   |
530 | case 'P':  |
531 | return !tIspunct(c) ? true : false;  |
532 |   |
533 | case 'l':  |
534 | return tIslower(c) ? true : false;  |
535 |   |
536 | case 'u':  |
537 | return tIsupper(c) ? true : false;  |
538 | }  |
539 |   |
540 | return false;  |
541 | }  |
542 |   |
543 |   |
544 | bool tRegex::MatchClass(const tRegex::Node* node, char c) const  |
545 | {  |
546 | do  |
547 | {  |
548 | switch (node->Type)  |
549 | {  |
550 | case tOperator_Range:  |
551 | if (c >= node->Left && c <= node->Right)  |
552 | return true;  |
553 | break;  |
554 |   |
555 | case tOperator_CClass:  |
556 | if (MatchCClass(node->Left, c))  |
557 | return true;  |
558 | break;  |
559 |   |
560 | default:  |
561 | if (c == node->Type)  |
562 | return true;  |
563 | }  |
564 | }  |
565 | while ((node->Next != -1) && (node = &Nodes[node->Next]));  |
566 |   |
567 | return false;  |
568 | }  |
569 |   |
570 |   |
571 | const char* tRegex::MatchNode(const tRegex::Node* node, const char* str, const tRegex::Node* next) const  |
572 | {  |
573 | int type = node->Type;  |
574 | switch (type)  |
575 | {  |
576 | case tOperator_Greedy:  |
577 | {  |
578 | const tRegex::Node* greedystop = 0;  |
579 | int p0 = (node->Right >> 16) & 0x0000FFFF, p1 = node->Right & 0x0000FFFF, nmaches = 0;  |
580 | const char* s = str, *good = str;  |
581 |   |
582 | if (node->Next != -1)  |
583 | greedystop = &Nodes[node->Next];  |
584 | else  |
585 | greedystop = next;  |
586 |   |
587 | while ((nmaches == 0xFFFF || nmaches < p1))  |
588 | {  |
589 | if (!(s = MatchNode(&Nodes[node->Left], s, greedystop)))  |
590 | break;  |
591 |   |
592 | nmaches++;  |
593 | good = s;  |
594 | if (greedystop)  |
595 | {  |
596 | // Checks that 0 matches satisfy the expression and if so skips.  |
597 | // If not would always stop (for instance if is a '?').  |
598 | if  |
599 | (  |
600 | greedystop->Type != tOperator_Greedy ||  |
601 | (greedystop->Type == tOperator_Greedy && ((greedystop->Right >> 16)&0x0000FFFF) != 0)  |
602 | )  |
603 | {  |
604 | tRegex::Node* gnext = 0;  |
605 | if (greedystop->Next != -1)  |
606 | gnext = &Nodes[greedystop->Next];  |
607 | else if (next && next->Next != -1)  |
608 | gnext = &Nodes[next->Next];  |
609 |   |
610 | const char* stop = MatchNode(greedystop, s, gnext);  |
611 | if (stop)  |
612 | {  |
613 | // If satisfied stop it.  |
614 | if (p0 == p1 && p0 == nmaches)  |
615 | break;  |
616 | else if (nmaches >= p0 && p1 == 0xFFFF)  |
617 | break;  |
618 | else if (nmaches >= p0 && nmaches <= p1)  |
619 | break;  |
620 | }  |
621 | }  |
622 | }  |
623 |   |
624 | if (s >= EOL)  |
625 | break;  |
626 | }  |
627 |   |
628 | if (p0 == p1 && p0 == nmaches)  |
629 | return good;  |
630 |   |
631 | else if (nmaches >= p0 && p1 == 0xFFFF)  |
632 | return good;  |
633 |   |
634 | else if (nmaches >= p0 && nmaches <= p1)  |
635 | return good;  |
636 |   |
637 | return nullptr;  |
638 | }  |
639 |   |
640 | case tOperator_Or:  |
641 | {  |
642 | const char* asd = str;  |
643 | tRegex::Node* temp = &Nodes[node->Left];  |
644 | while ((asd = MatchNode(temp, asd, 0)))  |
645 | {  |
646 | if (temp->Next != -1)  |
647 | temp = &Nodes[temp->Next];  |
648 | else  |
649 | return asd;  |
650 | }  |
651 |   |
652 | asd = str;  |
653 | temp = &Nodes[node->Right];  |
654 | while ((asd = MatchNode(temp, asd, 0)))  |
655 | {  |
656 | if (temp->Next != -1)  |
657 | temp = &Nodes[temp->Next];  |
658 | else  |
659 | return asd;  |
660 | }  |
661 | return nullptr;  |
662 | }  |
663 |   |
664 | case tOperator_Expr:  |
665 | case tOperator_NoCapExpr:  |
666 | {  |
667 | tRegex::Node* n = &Nodes[node->Left];  |
668 | const char* cur = str;  |
669 | int capture = -1;  |
670 | if (node->Type != tOperator_NoCapExpr && node->Right == CurrSubExpr)  |
671 | {  |
672 | capture = CurrSubExpr;  |
673 | Matches[capture].Begin = cur;  |
674 | CurrSubExpr++;  |
675 | }  |
676 |   |
677 | do  |
678 | {  |
679 | const tRegex::Node* subnext = 0;  |
680 | if (n->Next != -1)  |
681 | subnext = &Nodes[n->Next];  |
682 | else  |
683 | subnext = next;  |
684 |   |
685 | if (!(cur = MatchNode(n, cur, subnext)))  |
686 | {  |
687 | if (capture != -1)  |
688 | {  |
689 | Matches[capture].Begin = 0;  |
690 | Matches[capture].Length = 0;  |
691 | }  |
692 | return nullptr;  |
693 | }  |
694 | }  |
695 | while ((n->Next != -1) && (n = &Nodes[n->Next]));  |
696 |   |
697 | if (capture != -1)  |
698 | Matches[capture].Length = int(cur - Matches[capture].Begin);  |
699 |   |
700 | return cur;  |
701 | }  |
702 |   |
703 | case tOperator_WB:  |
704 | if  |
705 | (  |
706 | (str == BOL && !tIsspace(*str)) || (str == EOL && !tIsspace(*(str-1))) ||  |
707 | (!tIsspace(*str) && tIsspace(*(str+1))) || (tIsspace(*str) && !tIsspace(*(str+1)))  |
708 | )  |
709 | {  |
710 | return (node->Left == 'b') ? str : 0;  |
711 | }  |
712 | return (node->Left == 'b') ? 0 : str;  |
713 |   |
714 | case tOperator_BOL:  |
715 | if (str == BOL)  |
716 | return str;  |
717 | return nullptr;  |
718 |   |
719 | case tOperator_EOL:  |
720 | if (str == EOL)  |
721 | return str;  |
722 | return nullptr;  |
723 |   |
724 | case tOperator_Dot:  |
725 | str++;  |
726 | return str;  |
727 |   |
728 | case tOperator_NClass:  |
729 | case tOperator_Class:  |
730 | if  |
731 | (  |
732 | MatchClass(&Nodes[node->Left], *str) ?  |
733 | (type == tOperator_Class ? true : false) :  |
734 | (type == tOperator_NClass ? true : false)  |
735 | )  |
736 | {  |
737 | str++;  |
738 | return str;  |
739 | }  |
740 | return nullptr;  |
741 |   |
742 | case tOperator_CClass:  |
743 | if (MatchCClass(node->Left, *str))  |
744 | {  |
745 | str++;  |
746 | return str;  |
747 | }  |
748 | return nullptr;  |
749 |   |
750 | default:  |
751 | if (*str != node->Type)  |
752 | return nullptr;  |
753 | str++;  |
754 | return str;  |
755 | }  |
756 |   |
757 | return nullptr;  |
758 | }  |
759 |   |
760 |   |
761 | void tRegex::CompileInternal()  |
762 | {  |
763 | tAssert(Pattern && !EOL && !BOL && !NumNodes && !Matches && !NumSubExpr);  |
764 | Curr = Pattern;  |
765 | NumNodesAllocated = tStrlen(Pattern) * sizeof(char);  |
766 | Nodes = (tRegex::Node*)tMalloc(NumNodesAllocated * sizeof(tRegex::Node));  |
767 |   |
768 | First = NewNode(tOperator_Expr);  |
769 | int res = ListRec();  |
770 | Nodes[First].Left = res;  |
771 | if (*Curr != '\0')  |
772 | throw tError("Unexpected character." );  |
773 |   |
774 | #ifdef REGEX_DEBUG_PRINT  |
775 | int nsize = NumNodes;  |
776 | tRegex::Node* t = &Nodes[0];  |
777 | tPrintf("\n" );  |
778 | for (int i = 0;i < nsize; i++)  |
779 | {  |
780 | if (Nodes[i].Type >= tOperator_First)  |
781 | tPrintf("[%02d] %10s " , i, OperatorNames[Nodes[i].Type-tOperator_First]);  |
782 | else  |
783 | tPrintf("[%02d] %10c " , i, Nodes[i].Type);  |
784 | tPrintf("left %02d right %02d next %02d\n" , Nodes[i].Left, Nodes[i].Right, Nodes[i].Next);  |
785 | }  |
786 | tPrintf("\n" );  |
787 | #endif  |
788 |   |
789 | Matches = (MatchInternal*)tMalloc(NumSubExpr * sizeof(MatchInternal));  |
790 | tMemset(Matches, 0, NumSubExpr * sizeof(MatchInternal));  |
791 | }  |
792 |   |
793 |   |
794 | void tRegex::Clear()  |
795 | {  |
796 | CurrSubExpr = 0;  |
797 | if (Matches)  |
798 | tFree(Matches);  |
799 | Matches = 0;  |
800 |   |
801 | NumSubExpr = 0;  |
802 | NumNodes = 0;  |
803 | NumNodesAllocated = 0;  |
804 |   |
805 | if (Nodes)  |
806 | tFree(Nodes);  |
807 | Nodes = 0;  |
808 |   |
809 | First = -1;  |
810 | Curr = 0;  |
811 | BOL = 0;  |
812 | EOL = 0;  |
813 |   |
814 | if (Pattern)  |
815 | tFree(Pattern);  |
816 | Pattern = 0;  |
817 | }  |
818 |   |
819 |   |
820 | void tRegex::Compile(const tString& pattern)  |
821 | {  |
822 | Clear();  |
823 | if (pattern.Length() == 0)  |
824 | return;  |
825 |   |
826 | Pattern = (char*)tMalloc((pattern.Length() + 1) * sizeof(char));  |
827 | tStrcpy(Pattern, pattern.ConstText());  |
828 | CompileInternal();  |
829 | }  |
830 |   |
831 |   |
832 | void tRegex::Compile(const char* pattern)  |
833 | {  |
834 | Clear();  |
835 | if (!pattern)  |
836 | return;  |
837 |   |
838 | int len = tStrlen(pattern);  |
839 | if (!len)  |
840 | return;  |
841 |   |
842 | Pattern = (char*)tMalloc((len + 1) * sizeof(char));  |
843 | tStrcpy(Pattern, pattern);  |
844 | CompileInternal();  |
845 | }  |
846 |   |
847 |   |
848 | bool tRegex::IsMatch(const char* text) const  |
849 | {  |
850 | const char* res = 0;  |
851 | BOL = text;  |
852 | EOL = text + tStrlen(text);  |
853 | CurrSubExpr = 0;  |
854 | res = MatchNode(Nodes, text, 0);  |
855 | if (res == 0 || res != EOL)  |
856 | return false;  |
857 |   |
858 | return true;  |
859 | }  |
860 |   |
861 |   |
862 | void tRegex::Search(const char* textBegin, const char* textEnd, tList<Match>& matches) const  |
863 | {  |
864 | const char* cur = 0;  |
865 | int node = First;  |
866 | const char* origBegin = textBegin;  |
867 | if (textBegin >= textEnd)  |
868 | return;  |
869 |   |
870 | BOL = textBegin;  |
871 | EOL = textEnd;  |
872 | do  |
873 | {  |
874 | cur = textBegin;  |
875 | while (node != -1)  |
876 | {  |
877 | CurrSubExpr = 0;  |
878 | cur = MatchNode(&Nodes[node], cur, 0);  |
879 | if (!cur)  |
880 | break;  |
881 |   |
882 | node = Nodes[node].Next;  |
883 | }  |
884 |   |
885 | textBegin++;  |
886 | }  |
887 | while (cur == 0 && textBegin != textEnd);  |
888 |   |
889 | if (cur == 0)  |
890 | return;  |
891 |   |
892 | --textBegin;  |
893 |   |
894 | // Populate the matches list.  |
895 | tAssert(Matches);  |
896 | for (int m = 0; m < NumSubExpr; m++)  |
897 | {  |
898 | MatchInternal& mi = Matches[m];  |
899 | matches.Append(new Match(int(mi.Begin-origBegin), mi.Length));  |
900 | }  |
901 | }  |
902 |   |
903 |   |
904 | }  |
905 | |