1// tRegex.cpp 
2// 
3// Simple regular expression class. The code is based on TRex but has been significantly modified. The original license 
4// follows: 
5// 
6// Copyright (c) 2003-2006 Alberto Demichelis 
7// This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held 
8// liable for any damages arising from the use of this software. 
9// 
10// Permission is granted to anyone to use this software for any purpose, including commercial applications, and to 
11// alter it and redistribute it freely, subject to the following restrictions: 
12// 
13// 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
14// If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is 
15// not required. 
16// 
17// 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original 
18// software. 
19// 
20// 3. This notice may not be removed or altered from any source distribution. 
21// 
22// To be absolutely clear, the tRegex class found here is an 'altered source' version of the original. The alterations 
23// are under the following license: 
24// 
25// Copyright (c) 2006, 2017 Tristan Grimmer. 
26// Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby 
27// granted, provided that the above copyright notice and this permission notice appear in all copies. 
28// 
29// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL 
30// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 
31// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 
32// AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 
33// PERFORMANCE OF THIS SOFTWARE. 
34 
35#include <Foundation/tMemory.h> 
36#include "System/tThrow.h" 
37#include "System/tRegex.h" 
38#include "System/tPrint.h" 
39using namespace tStd
40using namespace tMem
41//#define REGEX_DEBUG_PRINT 
42namespace tSystem 
43
44 
45 
46enum tOperator 
47
48 // Invalid is 255 on purpose. 
49 tOperator_Invalid = tCharInvalid
50 tOperator_First
51 tOperator_Greedy = tOperator_First
52 tOperator_Or
53 tOperator_Expr
54 tOperator_NoCapExpr
55 tOperator_Dot
56 tOperator_Class
57 tOperator_CClass
58 tOperator_NClass
59 tOperator_Range
60 tOperator_Char
61 tOperator_EOL
62 tOperator_BOL
63 tOperator_WB
64 tOperator_NumOperators = tOperator_WB - tOperator_Invalid 
65}; 
66 
67 
68#ifdef REGEX_DEBUG_PRINT 
69static const char* OperatorNames[tOperator_NumOperators] = 
70
71 "Greedy"
72 "Or"
73 "Expr"
74 "NoCapExpr"
75 "Dot"
76 "Class"
77 "CClass"
78 "NClass"
79 "Range"
80 "Char"
81 "EOL"
82 "BOL"
83 "WB" 
84}; 
85#endif 
86 
87 
88const static char iSymbol_AnyChar = '.'
89const static char iSymbol_GreedyOneOrMore = '+'
90const static char iSymbol_GreedyZeroOrMore = '*'
91const static char iSymbol_GreedyZeroOrOne = '?'
92const static char iSymbol_Branch = '|'
93const static char iSymbol_StringEnd = '$'
94const static char iSymbol_StringBegin = '^'
95const static char iSymbol_Escape = '\\'
96 
97 
98int tRegex::NewNode(int type
99
100 tRegex::Node n
101 n.Type = type
102 n.Next = n.Right = n.Left = -1
103 
104 if (type == tOperator_Expr
105 n.Right = NumSubExpr++; 
106 
107 if (NumNodesAllocated < (NumNodes + 1)) 
108
109 int origSize = NumNodesAllocated * sizeof(tRegex::Node); 
110 NumNodesAllocated *= 2
111 tRegex::Node* newNodes = (tRegex::Node*)tMalloc(NumNodesAllocated * sizeof(tRegex::Node)); 
112 if (Nodes
113
114 tMemcpy(newNodes, Nodes, origSize); 
115 tFree(Nodes); 
116
117 Nodes = newNodes
118
119 
120 Nodes[NumNodes++] = n
121 int newid = NumNodes - 1
122 return newid
123
124 
125 
126void tRegex::Expect(int n
127
128 if ((*Curr) != n)  
129 throw tError("Expected bracket."); 
130 
131 Curr++; 
132
133 
134 
135char tRegex::EscapeChar() 
136
137 if (*Curr == iSymbol_Escape
138
139 Curr++; 
140 switch (*Curr
141
142 case 'v'
143 Curr++; 
144 return '\v'
145 
146 case 'n'
147 Curr++; 
148 return '\n'
149 
150 case 't'
151 Curr++; 
152 return '\t'
153 
154 case 'r'
155 Curr++; 
156 return '\r'
157 
158 case 'f'
159 Curr++; 
160 return '\f'
161 
162 default
163 return (*Curr++); 
164
165
166 else if (!tIsprint(*Curr)) 
167 throw tError("Letter expected."); 
168 
169 return (*Curr++); 
170
171 
172 
173int tRegex::CharClass(int classid
174
175 int n = NewNode(tOperator_CClass); 
176 Nodes[n].Left = classid
177 return n
178
179 
180 
181int tRegex::CharNode(bool isclass
182
183 if (*Curr == iSymbol_Escape
184
185 Curr++; 
186 switch (*Curr
187
188 case 'n'
189 Curr++; 
190 return NewNode('\n'); 
191 
192 case 't'
193 Curr++; 
194 return NewNode('\t'); 
195 
196 case 'r'
197 Curr++; 
198 return NewNode('\r'); 
199 
200 case 'f'
201 Curr++; 
202 return NewNode('\f'); 
203 
204 case 'v'
205 Curr++; 
206 return NewNode('\v'); 
207 
208 case 'a': case 'A': case 'w': case 'W'
209 case 's': case 'S': case 'd': case 'D'
210 case 'x': case 'X': case 'c': case 'C'
211 case 'p': case 'P': case 'l': case 'u'
212
213 char t = *Curr
214 Curr++; 
215 return CharClass(t); 
216
217 
218 case 'b'
219 case 'B'
220 if (!isclass
221
222 int node = NewNode(tOperator_WB); 
223 Nodes[node].Left = *Curr
224 Curr++;  
225 return node
226
227 
228 default
229
230 char t = *Curr
231 Curr++; 
232 return NewNode(t); 
233
234
235
236 else if (!tIsprint(*Curr)) 
237
238 throw tError("Letter expected."); 
239
240 
241 char t = *Curr
242 Curr++;  
243 return NewNode(t); 
244
245 
246 
247int tRegex::Class() 
248
249 int ret = -1
250 if (*Curr == iSymbol_StringBegin
251
252 ret = NewNode(tOperator_NClass); 
253 Curr++; 
254
255 else 
256 ret = NewNode(tOperator_Class); 
257 
258 if (*Curr == ']'
259 throw tError("Empty class."); 
260 
261 int first = -1
262 int chain = ret
263 while (*Curr != ']' && Curr != EOL
264
265 if (*Curr == '-' && first != -1
266
267 int r
268 if (*Curr++ == ']'
269 throw tError("Unfinished range."); 
270 
271 r = NewNode(tOperator_Range); 
272 if (first > *Curr
273 throw tError("Invalid range."); 
274 
275 if (Nodes[first].Type == tOperator_CClass
276 throw tError("Cannot use character classes in ranges."); 
277 
278 Nodes[r].Left = Nodes[first].Type
279 int t = EscapeChar(); 
280 Nodes[r].Right = t
281 Nodes[chain].Next = r
282 chain = r
283 first = -1
284
285 else 
286
287 if (first != -1
288
289 int c = first
290 Nodes[chain].Next = c
291 chain = c
292
293 first = CharNode(true); 
294
295
296 
297 if (first != -1
298
299 int c = first
300 Nodes[chain].Next = c
301 chain = c
302 first = -1
303
304 
305 // A hack? 
306 Nodes[ret].Left = Nodes[ret].Next
307 Nodes[ret].Next = -1
308 return ret
309
310 
311 
312int tRegex::ParseNumber() 
313
314 int ret = *Curr - '0'
315 int positions = 10
316 Curr++; 
317 
318 while (tIsdigit(*Curr)) 
319
320 ret = ret*10 + (*Curr++ - '0'); 
321 if (positions == 1000000000
322 throw tError("Overflow in numeric constant."); 
323 
324 positions *= 10
325 }; 
326 
327 return ret
328
329 
330 
331int tRegex::Element() 
332
333 int ret = -1
334 switch (*Curr
335
336 case '('
337
338 int expr
339 Curr++; 
340 
341 if (*Curr =='?'
342
343 Curr++; 
344 Expect(':'); 
345 expr = NewNode(tOperator_NoCapExpr); 
346
347 else 
348 expr = NewNode(tOperator_Expr); 
349 
350 int newn = ListRec(); 
351 Nodes[expr].Left = newn
352 ret = expr
353 Expect(')'); 
354 break
355
356 
357 case '['
358 Curr++; 
359 ret = Class(); 
360 Expect(']'); 
361 break
362 
363 case iSymbol_StringEnd
364 Curr++; 
365 ret = NewNode(tOperator_EOL); 
366 break
367 
368 case iSymbol_AnyChar
369 Curr++; 
370 ret = NewNode(tOperator_Dot); 
371 break
372 
373 default
374 ret = CharNode(false); 
375 break
376
377 
378 int op
379 bool isgreedy = false
380 uint16 p0 = 0, p1 = 0
381 switch (*Curr
382
383 case iSymbol_GreedyZeroOrMore
384 p0 = 0
385 p1 = 0xFFFF
386 Curr++; 
387 isgreedy = true
388 break
389 
390 case iSymbol_GreedyOneOrMore
391 p0 = 1
392 p1 = 0xFFFF
393 Curr++; 
394 isgreedy = true
395 break
396 
397 case iSymbol_GreedyZeroOrOne
398 p0 = 0
399 p1 = 1
400 Curr++; 
401 isgreedy = true
402 break
403 
404 case '{'
405 Curr++; 
406 if (!tIsdigit(*Curr)) 
407 throw tError("Number expected."); 
408 
409 p0 = uint16(ParseNumber()); 
410 switch (*Curr
411
412 case '}'
413 p1 = p0
414 Curr++; 
415 break
416 
417 case ','
418 Curr++; 
419 p1 = 0xFFFF
420 if (tIsdigit(*Curr)) 
421 p1 = uint16(ParseNumber()); 
422 Expect('}'); 
423 break
424 
425 default
426 throw tError("A , or } was expected."); 
427
428 
429 isgreedy = true;  
430 break
431
432 
433 if (isgreedy
434
435 int nnode = NewNode(tOperator_Greedy); 
436 op = tOperator_Greedy
437 Nodes[nnode].Left = ret
438 Nodes[nnode].Right = ((p0)<<16) | p1
439 ret = nnode
440
441 
442 if 
443
444 (*Curr != iSymbol_Branch) && (*Curr != ')') && 
445 (*Curr != iSymbol_GreedyZeroOrMore) && (*Curr != iSymbol_GreedyOneOrMore) && 
446 (*Curr != '\0'
447
448
449 int nnode = Element(); 
450 Nodes[ret].Next = nnode
451
452 
453 return ret
454
455 
456 
457int tRegex::ListRec() 
458
459 int ret = -1, e
460 if (*Curr == iSymbol_StringBegin
461
462 Curr++; 
463 ret = NewNode(tOperator_BOL); 
464
465 
466 e = Element(); 
467 if (ret != -1
468 Nodes[ret].Next = e
469 else 
470 ret = e
471 
472 if (*Curr == iSymbol_Branch
473
474 int temp, tright
475 Curr++; 
476 temp = NewNode(tOperator_Or); 
477 Nodes[temp].Left = ret
478 tright = ListRec(); 
479 Nodes[temp].Right = tright
480 ret = temp
481
482 
483 return ret
484
485 
486 
487bool tRegex::MatchCClass(int cclass, char c
488
489 switch (cclass
490
491 case 'a'
492 return tIsalpha(c) ? true : false
493 
494 case 'A'
495 return !tIsalpha(c) ? true : false
496 
497 case 'w'
498 return (tIsalnum(c) || c == '_') ? true : false
499 
500 case 'W'
501 return (!tIsalnum(c) && c != '_') ? true : false
502 
503 case 's'
504 return tIsspace(c) ? true : false
505 
506 case 'S'
507 return !tIsspace(c) ? true : false
508 
509 case 'd'
510 return tIsdigit(c) ? true : false
511 
512 case 'D'
513 return !tIsdigit(c) ? true : false
514 
515 case 'x'
516 return tIsxdigit(c) ? true : false
517 
518 case 'X'
519 return !tIsxdigit(c) ? true : false
520 
521 case 'c'
522 return tIscntrl(c) ? true : false
523 
524 case 'C'
525 return !tIscntrl(c) ? true : false
526 
527 case 'p'
528 return tIspunct(c) ? true : false
529 
530 case 'P'
531 return !tIspunct(c) ? true : false
532 
533 case 'l'
534 return tIslower(c) ? true : false
535 
536 case 'u'
537 return tIsupper(c) ? true : false
538
539 
540 return false
541
542 
543 
544bool tRegex::MatchClass(const tRegex::Node* node, char c) const 
545
546 do 
547
548 switch (node->Type
549
550 case tOperator_Range
551 if (c >= node->Left && c <= node->Right
552 return true
553 break
554 
555 case tOperator_CClass
556 if (MatchCClass(node->Left, c)) 
557 return true
558 break
559 
560 default
561 if (c == node->Type
562 return true
563
564
565 while ((node->Next != -1) && (node = &Nodes[node->Next])); 
566 
567 return false
568
569 
570 
571const char* tRegex::MatchNode(const tRegex::Node* node, const char* str, const tRegex::Node* next) const 
572
573 int type = node->Type
574 switch (type
575
576 case tOperator_Greedy
577
578 const tRegex::Node* greedystop = 0
579 int p0 = (node->Right >> 16) & 0x0000FFFF, p1 = node->Right & 0x0000FFFF, nmaches = 0
580 const char* s = str, *good = str
581 
582 if (node->Next != -1
583 greedystop = &Nodes[node->Next]; 
584 else 
585 greedystop = next
586 
587 while ((nmaches == 0xFFFF || nmaches < p1)) 
588
589 if (!(s = MatchNode(&Nodes[node->Left], s, greedystop))) 
590 break
591 
592 nmaches++; 
593 good = s
594 if (greedystop
595
596 // Checks that 0 matches satisfy the expression and if so skips. 
597 // If not would always stop (for instance if is a '?'). 
598 if 
599
600 greedystop->Type != tOperator_Greedy || 
601 (greedystop->Type == tOperator_Greedy && ((greedystop->Right >> 16)&0x0000FFFF) != 0
602
603
604 tRegex::Node* gnext = 0
605 if (greedystop->Next != -1
606 gnext = &Nodes[greedystop->Next]; 
607 else if (next && next->Next != -1
608 gnext = &Nodes[next->Next]; 
609 
610 const char* stop = MatchNode(greedystop, s, gnext); 
611 if (stop
612
613 // If satisfied stop it. 
614 if (p0 == p1 && p0 == nmaches
615 break
616 else if (nmaches >= p0 && p1 == 0xFFFF
617 break
618 else if (nmaches >= p0 && nmaches <= p1
619 break
620
621
622
623 
624 if (s >= EOL
625 break
626
627 
628 if (p0 == p1 && p0 == nmaches
629 return good
630 
631 else if (nmaches >= p0 && p1 == 0xFFFF
632 return good
633 
634 else if (nmaches >= p0 && nmaches <= p1
635 return good
636 
637 return nullptr
638
639 
640 case tOperator_Or
641
642 const char* asd = str
643 tRegex::Node* temp = &Nodes[node->Left]; 
644 while ((asd = MatchNode(temp, asd, 0))) 
645
646 if (temp->Next != -1
647 temp = &Nodes[temp->Next]; 
648 else 
649 return asd
650
651 
652 asd = str
653 temp = &Nodes[node->Right]; 
654 while ((asd = MatchNode(temp, asd, 0))) 
655
656 if (temp->Next != -1
657 temp = &Nodes[temp->Next]; 
658 else 
659 return asd
660
661 return nullptr
662
663 
664 case tOperator_Expr
665 case tOperator_NoCapExpr
666
667 tRegex::Node* n = &Nodes[node->Left]; 
668 const char* cur = str
669 int capture = -1
670 if (node->Type != tOperator_NoCapExpr && node->Right == CurrSubExpr
671
672 capture = CurrSubExpr
673 Matches[capture].Begin = cur
674 CurrSubExpr++; 
675
676 
677 do 
678
679 const tRegex::Node* subnext = 0
680 if (n->Next != -1
681 subnext = &Nodes[n->Next]; 
682 else 
683 subnext = next
684 
685 if (!(cur = MatchNode(n, cur, subnext))) 
686
687 if (capture != -1
688
689 Matches[capture].Begin = 0
690 Matches[capture].Length = 0
691
692 return nullptr
693
694
695 while ((n->Next != -1) && (n = &Nodes[n->Next])); 
696 
697 if (capture != -1
698 Matches[capture].Length = int(cur - Matches[capture].Begin); 
699 
700 return cur
701
702 
703 case tOperator_WB
704 if 
705
706 (str == BOL && !tIsspace(*str)) || (str == EOL && !tIsspace(*(str-1))) || 
707 (!tIsspace(*str) && tIsspace(*(str+1))) || (tIsspace(*str) && !tIsspace(*(str+1))) 
708
709
710 return (node->Left == 'b') ? str : 0
711
712 return (node->Left == 'b') ? 0 : str
713 
714 case tOperator_BOL
715 if (str == BOL
716 return str
717 return nullptr
718 
719 case tOperator_EOL
720 if (str == EOL
721 return str
722 return nullptr
723 
724 case tOperator_Dot
725 str++; 
726 return str
727 
728 case tOperator_NClass
729 case tOperator_Class
730 if 
731
732 MatchClass(&Nodes[node->Left], *str) ? 
733 (type == tOperator_Class ? true : false) : 
734 (type == tOperator_NClass ? true : false
735
736
737 str++; 
738 return str
739
740 return nullptr
741 
742 case tOperator_CClass
743 if (MatchCClass(node->Left, *str)) 
744
745 str++; 
746 return str
747
748 return nullptr
749 
750 default
751 if (*str != node->Type
752 return nullptr
753 str++; 
754 return str
755
756 
757 return nullptr
758
759 
760 
761void tRegex::CompileInternal() 
762
763 tAssert(Pattern && !EOL && !BOL && !NumNodes && !Matches && !NumSubExpr); 
764 Curr = Pattern
765 NumNodesAllocated = tStrlen(Pattern) * sizeof(char); 
766 Nodes = (tRegex::Node*)tMalloc(NumNodesAllocated * sizeof(tRegex::Node)); 
767 
768 First = NewNode(tOperator_Expr); 
769 int res = ListRec(); 
770 Nodes[First].Left = res
771 if (*Curr != '\0'
772 throw tError("Unexpected character."); 
773 
774 #ifdef REGEX_DEBUG_PRINT 
775 int nsize = NumNodes; 
776 tRegex::Node* t = &Nodes[0]; 
777 tPrintf("\n"); 
778 for (int i = 0;i < nsize; i++) 
779
780 if (Nodes[i].Type >= tOperator_First) 
781 tPrintf("[%02d] %10s ", i, OperatorNames[Nodes[i].Type-tOperator_First]); 
782 else 
783 tPrintf("[%02d] %10c ", i, Nodes[i].Type); 
784 tPrintf("left %02d right %02d next %02d\n", Nodes[i].Left, Nodes[i].Right, Nodes[i].Next); 
785
786 tPrintf("\n"); 
787 #endif 
788 
789 Matches = (MatchInternal*)tMalloc(NumSubExpr * sizeof(MatchInternal)); 
790 tMemset(Matches, 0, NumSubExpr * sizeof(MatchInternal)); 
791
792 
793 
794void tRegex::Clear() 
795
796 CurrSubExpr = 0
797 if (Matches
798 tFree(Matches); 
799 Matches = 0
800 
801 NumSubExpr = 0
802 NumNodes = 0
803 NumNodesAllocated = 0
804 
805 if (Nodes
806 tFree(Nodes); 
807 Nodes = 0
808 
809 First = -1
810 Curr = 0
811 BOL = 0
812 EOL = 0
813 
814 if (Pattern
815 tFree(Pattern); 
816 Pattern = 0
817
818 
819 
820void tRegex::Compile(const tString& pattern
821
822 Clear(); 
823 if (pattern.Length() == 0
824 return
825 
826 Pattern = (char*)tMalloc((pattern.Length() + 1) * sizeof(char)); 
827 tStrcpy(Pattern, pattern.ConstText()); 
828 CompileInternal(); 
829
830 
831 
832void tRegex::Compile(const char* pattern
833
834 Clear(); 
835 if (!pattern
836 return
837 
838 int len = tStrlen(pattern); 
839 if (!len
840 return
841 
842 Pattern = (char*)tMalloc((len + 1) * sizeof(char)); 
843 tStrcpy(Pattern, pattern); 
844 CompileInternal(); 
845
846 
847 
848bool tRegex::IsMatch(const char* text) const 
849
850 const char* res = 0
851 BOL = text
852 EOL = text + tStrlen(text); 
853 CurrSubExpr = 0
854 res = MatchNode(Nodes, text, 0); 
855 if (res == 0 || res != EOL
856 return false
857 
858 return true
859
860 
861 
862void tRegex::Search(const char* textBegin, const char* textEnd, tList<Match>& matches) const 
863
864 const char* cur = 0
865 int node = First
866 const char* origBegin = textBegin
867 if (textBegin >= textEnd
868 return
869 
870 BOL = textBegin
871 EOL = textEnd
872 do 
873
874 cur = textBegin
875 while (node != -1
876
877 CurrSubExpr = 0
878 cur = MatchNode(&Nodes[node], cur, 0); 
879 if (!cur
880 break
881 
882 node = Nodes[node].Next
883
884 
885 textBegin++; 
886
887 while (cur == 0 && textBegin != textEnd); 
888 
889 if (cur == 0
890 return
891 
892 --textBegin
893 
894 // Populate the matches list. 
895 tAssert(Matches); 
896 for (int m = 0; m < NumSubExpr; m++) 
897
898 MatchInternal& mi = Matches[m]; 
899 matches.Append(new Match(int(mi.Begin-origBegin), mi.Length)); 
900
901
902 
903 
904
905