Overview
  • Namespace
  • Class

Namespaces

  • apemsel
    • AttributedString

Classes

  • apemsel\AttributedString\AttributedString
  • apemsel\AttributedString\MutableAttributedString
  • apemsel\AttributedString\TokenizedAttributedString
  1 <?php
  2 namespace apemsel\AttributedString;
  3 
  4 /**
  5  * Extends AttributedString to support a tokenized string.
  6  *
  7  * You can mix working with tokens with working on string ranges using the AttributedString methods.
  8  * The original string is preserved.
  9  *
 10  * @author Adrian Pemsel <apemsel@gmail.com>
 11  */
 12 class TokenizedAttributedString extends AttributedString
 13 {
 14   protected $tokens;
 15   protected $tokenOffsets;
 16   
 17   /**
 18    * @param string|AttributedString $string String to work on
 19    * @param string $tokenizer Tokenizer to use, either "whitespace", "word" or a custom regex
 20    */
 21   public function __construct($string, $tokenizer = "whitespace") {
 22     $tokenizerFunction = "tokenizeOn".ucfirst($tokenizer);
 23 
 24     if ($tokenizer[0] == "/") {
 25       list($this->tokens, $this->tokenOffsets) = self::tokenizeOnRegex($string, $tokenizer);
 26     } else {
 27       if (!method_exists("apemsel\AttributedString\TokenizedAttributedString", $tokenizerFunction)) {
 28         throw new \InvalidArgumentException("Unknown tokenizer $tokenizer");
 29       }
 30       list($this->tokens, $this->tokenOffsets) = self::$tokenizerFunction($string);
 31     }
 32     
 33     parent::__construct($string);
 34   }
 35   
 36   /**
 37    * Return all tokens
 38    *
 39    * @return string[] tokens
 40    */
 41   public function getTokens() {
 42     return $this->tokens;
 43   }
 44   
 45   /**
 46    * Return all tokens' offsets
 47    *
 48    * @return int[] offsets
 49    */
 50   public function getTokenOffsets() {
 51     return $this->tokenOffsets;
 52   }
 53   
 54   /**
 55    * Return the number of tokens
 56    *
 57    * @return int count
 58    */
 59   public function getTokenCount() {
 60     return count($this->tokens);
 61   }
 62 
 63   /**
 64    * Get indicated token
 65    *
 66    * @param int $i token index
 67    * @return string token
 68    */
 69   public function getToken($i) {
 70     return $this->tokens[$i];
 71   }
 72   
 73   /**
 74    * Get indicated token offset
 75    *
 76    * @param int $i token index
 77    * @return int offset
 78    */
 79   public function getTokenOffset($i) {
 80     return $this->tokenOffsets[$i];
 81   }
 82   
 83   /**
 84    * Set a token to a given attribute and state
 85    *
 86    * @param int $i token index
 87    * @param string $attribute attribute name
 88    * @param bool $state attribute state
 89    */
 90   public function setTokenAttribute($i, $attribute, $state = true) {
 91     $token = $this->tokens[$i];
 92     $offset = $this->tokenOffsets[$i];
 93     $length = mb_strlen($token, "utf-8");
 94     
 95     return $this->setLength($offset, $length, $attribute, $state);
 96   }
 97   
 98   /**
 99    * Set a range of tokens to a given attribute and state
100    *
101    * @param int $from token start index
102    * @param int $to token end index
103    * @param string $attribute attribute name
104    * @param bool $state attribute state
105    */
106   public function setTokenRangeAttribute($from, $to, $attribute, $state = true) {
107     $fromOffset = $this->tokenOffsets[$from];
108     $toOffset = $this->tokenOffsets[$to] + mb_strlen($this->tokens[$to], "utf-8") - 1;
109     
110     return $this->setRange($fromOffset, $toOffset, $attribute, $state);
111   }
112   
113   /**
114    * Set all tokens matching given dictionary to attribute and state
115    *
116    * @param string[] $dictionary dictionary
117    * @param string $attribute attribute name
118    * @param bool $state attribute state
119    */
120   public function setTokenDictionaryAttribute($dictionary, $attribute, $state = true) {
121     foreach($this->tokens as $i => $token) {
122       if (in_array($token, $dictionary)) {
123         $this->setTokenAttribute($i, $attribute, $state);
124       }
125     }
126   }
127   
128   /**
129    * Get all attribute of token at given index
130    *
131    * @param int token index
132    * @return string[] attributes
133    */
134   public function attributesAtToken($i) {
135     return $this->attributesAt($this->tokenOffsets[$i]);
136   }
137   
138   /**
139    * Convert all tokens to lower case
140    */
141   public function lowercaseTokens() {
142     $this->tokens = array_map(function($token) {
143       return mb_strtolower($token, "utf-8");
144     }, $this->tokens);
145   }
146   
147   /**
148    * Tokenize a string on whitespace
149    *
150    * @param string $string string to be tokenized
151    * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
152    */
153   public static function tokenizeOnWhitespace($string) {
154     // Matches pontential whitespace in front of the token and the token itself.
155     // Matching the whitespace could be omitted, but that results in slower execution ;-)
156     return self::tokenizeOnRegex($string, '/[\s\n\r]*([^\s\n\r]+)/u');
157   }
158   
159   /**
160    * Tokenize a string on words
161    *
162    * @param string $string string to be tokenized
163    * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
164    */
165   public static function tokenizeOnWords($string) {
166     return self::tokenizeOnRegex($string, '/([\w]+)/u');
167   }
168   
169   /**
170    * Tokenize a string with a given regex
171    *
172    * @param string $string string to be tokenized
173    * @param string $pattern regex. The token must be captured in the first subgroup.
174    * @return array array of two arrays, with tokens at index 0 and their offsets at index 1
175    */
176   public static function tokenizeOnRegex($string, $pattern)
177   {
178     // Fastest way to get both tokens and their offsets, but not easy to understand.
179     preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE);
180 
181     // $matches[1] contains an array of all matched subexpressions (= tokens)
182     // with their offset in column 1 and the matched token in column 0
183     $tokens = array_column($matches[1], 0);
184     $tokenOffsets = array_column($matches[1], 1);
185     
186     return [$tokens, $tokenOffsets];
187   }
188   
189   // Modified ArrayAccess interface
190   
191   /**
192    * Check if the token at the given index exists
193    *
194    * @param int $i token index
195    * @return bool does the offset exist
196    */
197   public function offsetExists($i) {
198     return $i < $this->getTokenCount();
199   }
200   
201   /**
202    * Get token at given index
203    *
204    * Note: TokenizedAttributedString uses the ArrayAccess interface to access tokens, not chars!
205    *
206    * @param int $i token index
207    * @return string token
208    */
209   public function offsetGet($i) {
210     return $this->tokens[$i];
211   }
212 }
213 
API documentation generated by ApiGen