1// tRegex.h 
2// 
3// Simple regular expression class. The code is based on TRex but has been significantly modified. The original license 
4// follows: 
5// 
6// Copyright (c) 2003-2006 Alberto Demichelis 
7// This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held 
8// liable for any damages arising from the use of this software. 
9// 
10// Permission is granted to anyone to use this software for any purpose, including commercial applications, and to 
11// alter it and redistribute it freely, subject to the following restrictions: 
12// 
13// 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
14// If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is 
15// not required. 
16// 
17// 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original 
18// software. 
19// 
20// 3. This notice may not be removed or altered from any source distribution. 
21// 
22// To be absolutely clear, the tRegex class found here is an 'altered source' version of the original. The alterations 
23// are under the following license: 
24// 
25// Copyright (c) 2006, 2017 Tristan Grimmer. 
26// Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby 
27// granted, provided that the above copyright notice and this permission notice appear in all copies. 
28// 
29// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL 
30// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 
31// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 
32// AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 
33// PERFORMANCE OF THIS SOFTWARE. 
34 
35#pragma once 
36#include "Foundation/tStandard.h" 
37#include "Foundation/tList.h" 
38#include "Foundation/tString.h" 
39namespace tSystem 
40
41 
42 
43// The format of the regular expression pattern strings is fairly standard. The following is supported: 
44// 
45// Expressions: 
46// \ Quote the next meta-character. 
47// ^ Match the beginning of the string. 
48// . Match any character. 
49// $ Match the end of the string. 
50// | Alternation. 
51// () Grouping (creates a capture). 
52// [] Character class. 
53// 
54// Greedy Closures: 
55// * Match 0 or more times. 
56// + Match 1 or more times. 
57// ? Match 1 or 0 times. 
58// {n} Match exactly n times. 
59// {n,} Match at least n times. 
60// {n,m} Match at least n but not more than m times. 
61// 
62// Escape Characters. 
63// \t Tab (HT, TAB) 
64// \n Newline (LF, NL) 
65// \r Return (CR) 
66// \f Form feed (FF) 
67// 
68// Predefined Classes: 
69// \l Lowercase next char. 
70// \u Uppercase next char. 
71// \a Letters. 
72// \A Non-letters. 
73// \w Alphanumeric [0-9a-zA-Z]. 
74// \W Non-alphanumeric. 
75// \s Space. 
76// \S Non-space. 
77// \d Digits. 
78// \D Non-digits. 
79// \x Hexadecimal digits. 
80// \X Non-hexadecimal digits. 
81// \c Control characters. 
82// \C Non-control characters. 
83// \p Punctuation. 
84// \P Non-punctuation. 
85// \b Word boundary. 
86// \B Non-word boundary. 
87class tRegex 
88
89public
90 tRegex() : Pattern(nullptr), Nodes(nullptr), Matches(nullptr) { Clear(); } 
91 tRegex(const tString& pattern) : Pattern(nullptr), Nodes(nullptr), Matches(nullptr) { Clear(); Compile(pattern); } 
92 tRegex(const char* pattern) : Pattern(nullptr), Nodes(nullptr), Matches(nullptr) { Clear(); Compile(pattern); } 
93 ~tRegex() { Clear(); } 
94 
95 // Compiles a regular expression (described above). Any previously compiled expression is lost. 
96 void Compile(const tString& pattern); 
97 bool IsMatch(const tString& text) const { return IsMatch(text.ConstText()); } 
98 void Compile(const char* pattern); 
99 bool IsMatch(const char* text) const; // Returns true is a perfect match is attained. 
100 bool IsValid() const { return Pattern ? true : false; } 
101 void Clear(); 
102 
103 // Returns the number of sub-expressions for the compiled pattern. If all expressions match a test pattern, this is 
104 // how many matches will be returned. For example, (a)(b) will return 3. The 3 are (a)(b), (a), and (b). You do not 
105 // need a perfect match from IsMatch for this to happen. 
106 int GetNumSubExpressions() const { return IsValid() ? NumSubExpr : 0; } 
107 struct Match : public tLink<Match
108
109 Match() : IndexStart(0), Length(0) { } 
110 Match(const Match& src) : IndexStart(src.IndexStart), Length(src.Length) { } 
111 Match(int startIndex, int length) : IndexStart(startIndex), Length(length) { } 
112 bool IsValid() const { return (Length > 0) ? true : false; } 
113 tString GetString(const tString& text) const { if (!IsValid()) return tString(); tString m(Length); tStd::tStrncpy(m.Text(), text.ConstText()+IndexStart, Length); return m; } 
114 
115 int IndexStart
116 int Length
117 }; 
118 
119 // These populate the supplied list of matches. If no matches are appended to the list that means none were found. 
120 // The end pointer should be one past the last valid character to check. 
121 void Search(const char* begin, const char* end, tList<Match>&) const
122 void Search(const char* text, tList<Match>& matches) const { Search(text, text + tStd::tStrlen(text), matches); } 
123 void Search(const tString& text, tList<Match>& matches) const { Search(text.ConstText(), matches); } 
124 
125private
126 struct Node 
127
128 // These members may be an operator or the actual character. That's why they are ints and not tOperators. 
129 int Type
130 int Left
131 int Right
132 int Next
133 }; 
134 
135 struct MatchInternal 
136
137 const char* Begin
138 int Length
139 }; 
140 
141 void CompileInternal(); 
142 
143 int NewNode(int type); 
144 int ListRec(); 
145 void Expect(int n); 
146 char EscapeChar(); 
147 int CharClass(int classid); 
148 int CharNode(bool isclass); 
149 int Class(); 
150 int ParseNumber(); 
151 int Element(); 
152 static bool MatchCClass(int cclass, char c); 
153 bool MatchClass(const Node*, char c) const
154 const char* MatchNode(const Node*, const char* str, const Node* next) const
155 
156 char* Pattern; // Owned by this object. 
157 mutable const char* EOL; // End of line. 
158 mutable const char* BOL; // Beginning of line. 
159 const char* Curr
160 int First
161 Node* Nodes
162 int NumNodesAllocated
163 int NumNodes
164 int NumSubExpr
165 MatchInternal* Matches
166 mutable int CurrSubExpr
167}; 
168 
169 
170
171