1 <?php
2 namespace apemsel\AttributedString;
3
4 /**
5 * Extends AttributedString to support a tokenized string.
6 *
7 * You can mix working with tokens with working on string ranges using the AttributedString methods.
8 * The original string is preserved.
9 *
10 * @author Adrian Pemsel <apemsel@gmail.com>
11 */
class TokenizedAttributedString extends AttributedString
{
  /** @var string[] tokens in order of appearance */
  protected $tokens;

  /** @var int[] offset (in UTF-8 characters) of each token within the original string */
  protected $tokenOffsets;

  /**
   * Tokenize the given string and hand it over to the AttributedString constructor.
   *
   * @param string|AttributedString $string String to work on
   * @param string $tokenizer Tokenizer to use, either "whitespace", "word" (alias "words")
   *                          or a custom regex delimited by "/" that captures the token in its first subgroup
   * @throws \InvalidArgumentException if the tokenizer is empty, unknown or an unusable regex
   */
  public function __construct($string, $tokenizer = "whitespace") {
    // Guard first: indexing an empty string below would raise an "uninitialized offset" error
    if (!is_string($tokenizer) || $tokenizer === "") {
      throw new \InvalidArgumentException("Tokenizer must be a non-empty string");
    }

    if ($tokenizer[0] === "/") {
      list($this->tokens, $this->tokenOffsets) = self::tokenizeOnRegex($string, $tokenizer);
    } else {
      // The documented name is "word", but the implementing method is tokenizeOnWords()
      $name = (strcasecmp($tokenizer, "word") === 0) ? "words" : $tokenizer;
      $tokenizerFunction = "tokenizeOn".ucfirst($name);

      // "regex" would resolve to tokenizeOnRegex(), which cannot be called without a pattern
      if (strcasecmp($name, "regex") === 0 || !method_exists(__CLASS__, $tokenizerFunction)) {
        throw new \InvalidArgumentException("Unknown tokenizer $tokenizer");
      }

      list($this->tokens, $this->tokenOffsets) = self::$tokenizerFunction($string);
    }

    parent::__construct($string);
  }

  /**
   * Return all tokens
   *
   * @return string[] tokens
   */
  public function getTokens() {
    return $this->tokens;
  }

  /**
   * Return all tokens' offsets
   *
   * @return int[] offsets
   */
  public function getTokenOffsets() {
    return $this->tokenOffsets;
  }

  /**
   * Return the number of tokens
   *
   * @return int count
   */
  public function getTokenCount() {
    return count($this->tokens);
  }

  /**
   * Get indicated token
   *
   * @param int $i token index
   * @return string token
   */
  public function getToken($i) {
    return $this->tokens[$i];
  }

  /**
   * Get indicated token offset
   *
   * @param int $i token index
   * @return int offset
   */
  public function getTokenOffset($i) {
    return $this->tokenOffsets[$i];
  }

  /**
   * Set a token to a given attribute and state
   *
   * @param int $i token index
   * @param string $attribute attribute name
   * @param bool $state attribute state
   * @return mixed whatever setLength() of the parent class returns
   */
  public function setTokenAttribute($i, $attribute, $state = true) {
    $offset = $this->tokenOffsets[$i];
    $length = mb_strlen($this->tokens[$i], "utf-8");

    return $this->setLength($offset, $length, $attribute, $state);
  }

  /**
   * Set a range of tokens to a given attribute and state
   *
   * The range covers everything from the first character of token $from
   * to the last character of token $to, including text between the tokens.
   *
   * @param int $from token start index
   * @param int $to token end index
   * @param string $attribute attribute name
   * @param bool $state attribute state
   * @return mixed whatever setRange() of the parent class returns
   */
  public function setTokenRangeAttribute($from, $to, $attribute, $state = true) {
    $fromOffset = $this->tokenOffsets[$from];
    // Inclusive end offset: last character of the final token
    $toOffset = $this->tokenOffsets[$to] + mb_strlen($this->tokens[$to], "utf-8") - 1;

    return $this->setRange($fromOffset, $toOffset, $attribute, $state);
  }

  /**
   * Set all tokens matching given dictionary to attribute and state
   *
   * Tokens are matched by exact string identity, so numeric look-alikes
   * such as "1e3" and "1000" are not confused with each other.
   *
   * @param string[] $dictionary dictionary
   * @param string $attribute attribute name
   * @param bool $state attribute state
   */
  public function setTokenDictionaryAttribute($dictionary, $attribute, $state = true) {
    // Build a lookup table once instead of scanning the dictionary for every token
    $lookup = array_fill_keys(array_map('strval', $dictionary), true);

    foreach ($this->tokens as $i => $token) {
      if (isset($lookup[$token])) {
        $this->setTokenAttribute($i, $attribute, $state);
      }
    }
  }

  /**
   * Get all attributes of token at given index
   *
   * The attributes are looked up at the offset of the token's first character.
   *
   * @param int $i token index
   * @return string[] attributes
   */
  public function attributesAtToken($i) {
    return $this->attributesAt($this->tokenOffsets[$i]);
  }

  /**
   * Convert all tokens to lower case
   *
   * Only the token list is changed; token offsets stay as they are.
   */
  public function lowercaseTokens() {
    $this->tokens = array_map(static function($token) {
      return mb_strtolower($token, "utf-8");
    }, $this->tokens);
  }

  /**
   * Tokenize a string on whitespace
   *
   * @param string $string string to be tokenized
   * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
   */
  public static function tokenizeOnWhitespace($string) {
    // Matches potential whitespace in front of the token and the token itself.
    // Matching the whitespace could be omitted, but that results in slower execution ;-)
    return self::tokenizeOnRegex($string, '/[\s\n\r]*([^\s\n\r]+)/u');
  }

  /**
   * Tokenize a string on words
   *
   * @param string $string string to be tokenized
   * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
   */
  public static function tokenizeOnWords($string) {
    return self::tokenizeOnRegex($string, '/([\w]+)/u');
  }

  /**
   * Tokenize a string with a given regex
   *
   * Offsets are returned in UTF-8 characters, not bytes, so they line up with the
   * mb_strlen() based token lengths used by setTokenAttribute() and setTokenRangeAttribute().
   *
   * @param string $string string to be tokenized
   * @param string $pattern regex. The token must be captured in the first subgroup.
   * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
   * @throws \InvalidArgumentException if matching fails or the pattern has no first subgroup
   */
  public static function tokenizeOnRegex($string, $pattern)
  {
    $subject = (string) $string;

    // Fastest way to get both tokens and their offsets, but not easy to understand.
    $result = preg_match_all($pattern, $subject, $matches, PREG_OFFSET_CAPTURE);

    if (!is_int($result)) {
      // Invalid pattern, or e.g. malformed UTF-8 in combination with the /u modifier
      throw new \InvalidArgumentException(
        "Tokenizing with pattern $pattern failed (PCRE error code ".preg_last_error().")"
      );
    }

    if (!isset($matches[1])) {
      throw new \InvalidArgumentException(
        "Tokenizer pattern $pattern must capture the token in its first subgroup"
      );
    }

    // $matches[1] contains an array of all matched subexpressions (= tokens)
    // with their offset in column 1 and the matched token in column 0.
    // PREG_OFFSET_CAPTURE reports offsets in bytes, so they are converted to
    // character offsets by counting only the characters between consecutive tokens.
    $tokens = [];
    $tokenOffsets = [];
    $byteCursor = 0;
    $charCursor = 0;

    foreach ($matches[1] as $match) {
      if (!is_array($match)) {
        // Not a [token, offset] pair; nothing to record
        continue;
      }

      list($token, $byteOffset) = $match;

      if ($byteOffset >= 0) {
        if ($byteOffset < $byteCursor) {
          // Offsets are normally ascending; restart counting if a pattern yields one out of order
          $byteCursor = 0;
          $charCursor = 0;
        }

        $charCursor += mb_strlen(substr($subject, $byteCursor, $byteOffset - $byteCursor), "utf-8");
        $byteCursor = $byteOffset;
        $byteOffset = $charCursor;
      }
      // A negative offset marks a subgroup that did not take part in the match; it is kept as is

      $tokens[] = $token;
      $tokenOffsets[] = $byteOffset;
    }

    return [$tokens, $tokenOffsets];
  }

  // Modified ArrayAccess interface

  /**
   * Check if the token at the given index exists
   *
   * @param int $i token index
   * @return bool does the offset exist
   */
  public function offsetExists($i) {
    // isset() also rejects negative indexes, which a plain "$i < count" check would accept
    return isset($this->tokens[$i]);
  }

  /**
   * Get token at given index
   *
   * Note: TokenizedAttributedString uses the ArrayAccess interface to access tokens, not chars!
   *
   * @param int $i token index
   * @return string token
   */
  public function offsetGet($i) {
    return $this->tokens[$i];
  }
}
213