1 | // tRegex.h  |
2 | //  |
3 | // Simple regular expression class. The code is based on TRex but has been significantly modified. The original license  |
4 | // follows:  |
5 | //  |
6 | // Copyright (c) 2003-2006 Alberto Demichelis  |
7 | // This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held  |
8 | // liable for any damages arising from the use of this software.  |
9 | //  |
10 | // Permission is granted to anyone to use this software for any purpose, including commercial applications, and to  |
11 | // alter it and redistribute it freely, subject to the following restrictions:  |
12 | //  |
13 | // 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software.  |
14 | // If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is  |
15 | // not required.  |
16 | //  |
17 | // 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original  |
18 | // software.  |
19 | //  |
20 | // 3. This notice may not be removed or altered from any source distribution.  |
21 | //  |
22 | // To be absolutely clear, the tRegex class found here is an 'altered source' version of the original. The alterations  |
23 | // are under the following license:  |
24 | //  |
25 | // Copyright (c) 2006, 2017 Tristan Grimmer.  |
26 | // Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby  |
27 | // granted, provided that the above copyright notice and this permission notice appear in all copies.  |
28 | //  |
29 | // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL  |
30 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,  |
31 | // INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN  |
32 | // AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR  |
33 | // PERFORMANCE OF THIS SOFTWARE.  |
34 |   |
35 | #pragma once  |
36 | #include "Foundation/tStandard.h"  |
37 | #include "Foundation/tList.h"  |
38 | #include "Foundation/tString.h"  |
39 | namespace tSystem  |
40 | {  |
41 |   |
42 |   |
43 | // The format of the regular expression pattern strings is fairly standard. The following is supported:  |
44 | //  |
45 | // Expressions:  |
46 | // \ Quote the next meta-character.  |
47 | // ^ Match the beginning of the string.  |
48 | // . Match any character.  |
49 | // $ Match the end of the string.  |
50 | // | Alternation.  |
51 | // () Grouping (creates a capture).  |
52 | // [] Character class.  |
53 | //  |
54 | // Greedy Closures:  |
55 | // * Match 0 or more times.  |
56 | // + Match 1 or more times.  |
57 | // ? Match 1 or 0 times.  |
58 | // {n} Match exactly n times.  |
59 | // {n,} Match at least n times.  |
60 | // {n,m} Match at least n but not more than m times.  |
61 | //  |
// Escape Characters:
63 | // \t Tab (HT, TAB)  |
64 | // \n Newline (LF, NL)  |
65 | // \r Return (CR)  |
66 | // \f Form feed (FF)  |
67 | //  |
68 | // Predefined Classes:  |
69 | // \l Lowercase next char.  |
70 | // \u Uppercase next char.  |
71 | // \a Letters.  |
72 | // \A Non-letters.  |
73 | // \w Alphanumeric [0-9a-zA-Z].  |
74 | // \W Non-alphanumeric.  |
75 | // \s Space.  |
76 | // \S Non-space.  |
77 | // \d Digits.  |
78 | // \D Non-digits.  |
79 | // \x Hexadecimal digits.  |
80 | // \X Non-hexadecimal digits.  |
81 | // \c Control characters.  |
82 | // \C Non-control characters.  |
83 | // \p Punctuation.  |
84 | // \P Non-punctuation.  |
85 | // \b Word boundary.  |
86 | // \B Non-word boundary.  |
87 | class tRegex  |
88 | {  |
89 | public:  |
90 | tRegex() : Pattern(nullptr), Nodes(nullptr), Matches(nullptr) { Clear(); }  |
91 | tRegex(const tString& pattern) : Pattern(nullptr), Nodes(nullptr), Matches(nullptr) { Clear(); Compile(pattern); }  |
92 | tRegex(const char* pattern) : Pattern(nullptr), Nodes(nullptr), Matches(nullptr) { Clear(); Compile(pattern); }  |
93 | ~tRegex() { Clear(); }  |
94 |   |
95 | // Compiles a regular expression (described above). Any previously compiled expression is lost.  |
96 | void Compile(const tString& pattern);  |
97 | bool IsMatch(const tString& text) const { return IsMatch(text.ConstText()); }  |
98 | void Compile(const char* pattern);  |
99 | bool IsMatch(const char* text) const; // Returns true is a perfect match is attained.  |
100 | bool IsValid() const { return Pattern ? true : false; }  |
101 | void Clear();  |
102 |   |
103 | // Returns the number of sub-expressions for the compiled pattern. If all expressions match a test pattern, this is  |
104 | // how many matches will be returned. For example, (a)(b) will return 3. The 3 are (a)(b), (a), and (b). You do not  |
105 | // need a perfect match from IsMatch for this to happen.  |
106 | int GetNumSubExpressions() const { return IsValid() ? NumSubExpr : 0; }  |
107 | struct Match : public tLink<Match>  |
108 | {  |
109 | Match() : IndexStart(0), Length(0) { }  |
110 | Match(const Match& src) : IndexStart(src.IndexStart), Length(src.Length) { }  |
111 | Match(int startIndex, int length) : IndexStart(startIndex), Length(length) { }  |
112 | bool IsValid() const { return (Length > 0) ? true : false; }  |
113 | tString GetString(const tString& text) const { if (!IsValid()) return tString(); tString m(Length); tStd::tStrncpy(m.Text(), text.ConstText()+IndexStart, Length); return m; }  |
114 |   |
115 | int IndexStart;  |
116 | int Length;  |
117 | };  |
118 |   |
119 | // These populate the supplied list of matches. If no matches are appended to the list that means none were found.  |
120 | // The end pointer should be one past the last valid character to check.  |
121 | void Search(const char* begin, const char* end, tList<Match>&) const;  |
122 | void Search(const char* text, tList<Match>& matches) const { Search(text, text + tStd::tStrlen(text), matches); }  |
123 | void Search(const tString& text, tList<Match>& matches) const { Search(text.ConstText(), matches); }  |
124 |   |
125 | private:  |
126 | struct Node  |
127 | {  |
128 | // These members may be an operator or the actual character. That's why they are ints and not tOperators.  |
129 | int Type;  |
130 | int Left;  |
131 | int Right;  |
132 | int Next;  |
133 | };  |
134 |   |
135 | struct MatchInternal  |
136 | {  |
137 | const char* Begin;  |
138 | int Length;  |
139 | };  |
140 |   |
141 | void CompileInternal();  |
142 |   |
143 | int NewNode(int type);  |
144 | int ListRec();  |
145 | void Expect(int n);  |
146 | char EscapeChar();  |
147 | int CharClass(int classid);  |
148 | int CharNode(bool isclass);  |
149 | int Class();  |
150 | int ParseNumber();  |
151 | int Element();  |
152 | static bool MatchCClass(int cclass, char c);  |
153 | bool MatchClass(const Node*, char c) const;  |
154 | const char* MatchNode(const Node*, const char* str, const Node* next) const;  |
155 |   |
156 | char* Pattern; // Owned by this object.  |
157 | mutable const char* EOL; // End of line.  |
158 | mutable const char* BOL; // Beginning of line.  |
159 | const char* Curr;  |
160 | int First;  |
161 | Node* Nodes;  |
162 | int NumNodesAllocated;  |
163 | int NumNodes;  |
164 | int NumSubExpr;  |
165 | MatchInternal* Matches;  |
166 | mutable int CurrSubExpr;  |
167 | };  |
168 |   |
169 |   |
170 | }  |
171 | |