Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
99.19% |
246 / 248 |
|
88.89% |
16 / 18 |
CRAP | |
0.00% |
0 / 1 |
| Js_Structure_Scanner | |
99.19% |
246 / 248 |
|
88.89% |
16 / 18 |
116 | |
0.00% |
0 / 1 |
| looks_broken | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
| __construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| run | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
12 | |||
| scan_code | |
100.00% |
68 / 68 |
|
100.00% |
1 / 1 |
30 | |||
| scan_string | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
7 | |||
| scan_template | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
7 | |||
| scan_regex | |
100.00% |
26 / 26 |
|
100.00% |
1 / 1 |
12 | |||
| scan_line_comment | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| scan_block_comment | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| is_broken_at_eof | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
5.03 | |||
| closes_interpolation | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
3 | |||
| regex_allowed_here | |
100.00% |
48 / 48 |
|
100.00% |
1 / 1 |
10 | |||
| record_prev | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| return_to_code | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| peek | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
| is_ident_char | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
4 | |||
| is_ident_start | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
4 | |||
| is_exponent_at | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
6 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * Heuristic structural validator for minified JavaScript. |
| 4 | * |
| 5 | * @link https://automattic.com |
| 6 | * @package automattic/jetpack-boost |
| 7 | */ |
| 8 | |
| 9 | namespace Automattic\Jetpack_Boost\Lib; |
| 10 | |
| 11 | /** |
| 12 | * Answers one narrow question: does a chunk of minified JS look structurally |
| 13 | * broken/truncated? It is NOT a full JS validator (PHP has none built in) -- it |
| 14 | * is a single left-to-right lexer pass that tracks string/template/regex/comment |
| 15 | * state so brackets inside those are ignored, counts bracket nesting in code, and |
| 16 | * reports "broken" when, at EOF, a string/template/regex/block-comment is still |
| 17 | * open or brackets are unbalanced. That is exactly the signature of the truncation |
| 18 | * corruption the bundled MatthiasMullie minifier produces on modern JS. |
| 19 | * |
| 20 | * It deliberately fails safe: a "broken" verdict only causes the caller to skip |
| 21 | * re-minification for that bundle, so ambiguity is cheap to get wrong. |
| 22 | * |
| 23 | * Known blind spot: corruption that stays perfectly balanced (e.g. semantic-only |
| 24 | * damage that still parses) returns "intact". Those do not crash the page and are |
| 25 | * not what this guard targets. |
| 26 | * |
| 27 | * Known (harmless) false positives: a `/` after `}` is read as a regex, not |
| 28 | * division, because telling a block `}` apart from an object-literal `}` needs a |
| 29 | * full parser; so valid object-literal division such as `({}/2)` is reported |
| 30 | * "broken". This generalizes to any object-literal `}` followed by `/` anywhere |
| 31 | * in a source file (`f({}/2)`, `[{}/2]`, `var o={a:1}/2`), and because one |
| 32 | * occurrence skips re-minification of that whole file's contribution to the |
| 33 | * bundle, the cost is more than a single statement's worth of compression. It is |
| 34 | * still fail-safe |
| 35 | * (it only skips re-minification, never corrupts output), so the heuristic stays |
| 36 | * simple rather than guess at block-vs-expression context; the new fallback |
| 37 | * observability hook lets the real-world frequency be measured. |
| 38 | */ |
class Js_Structure_Scanner {

	/**
	 * Lexer states.
	 */
	private const ST_CODE          = 0; // Ordinary code (brackets are counted here).
	private const ST_STRING        = 1; // Inside a '...' or "..." string.
	private const ST_TEMPLATE      = 2; // Inside a `...` template literal.
	private const ST_REGEX         = 3; // Inside a /.../ regex literal.
	private const ST_LINE_COMMENT  = 4; // Inside a // comment.
	private const ST_BLOCK_COMMENT = 5; // Inside a /* */ comment.

	/**
	 * Upper bound (in bytes) on output we will fully scan per call.
	 *
	 * The scan only runs on a cache miss, but on hosts where the concatenation
	 * cache is not writable the _jb_static path re-minifies (and would re-scan)
	 * on every request, so the per-call cost must stay bounded. This is set high
	 * enough to cover realistic modern bundles (Tailwind/SPA/plugin-heavy output
	 * routinely runs a few MB) so the truncation signature is actually caught,
	 * rather than silently bypassed for anything above an aggressive cap.
	 *
	 * For the rare bundle larger than this, the full lexer scan is skipped, but a
	 * cheap gross-truncation backstop still runs when the original input is known
	 * (see looks_broken()).
	 */
	private const MAX_SCAN_BYTES = 8388608; // 8 MB.

	/**
	 * Above MAX_SCAN_BYTES we cannot afford the full scan, but a minified output
	 * that is dramatically smaller than its input is the truncation signature by
	 * itself. Real-world minification shrinks JS by ~20-40%; an output below this
	 * fraction of the original is treated as truncated. The threshold is
	 * deliberately conservative so it only trips on gross truncation, never on
	 * ordinary (even aggressive) minification.
	 *
	 * Blind spot: a size-preserving break above the cap (e.g. an unterminated
	 * template at EOF, which drops only a byte or two) leaves the ratio normal and
	 * is not caught -- and because no fallback fires, the observability hook does not
	 * surface it either. Accepted because minification runs per source file, so the
	 * cap only engages for a pathological multi-MB single source file, a vanishingly
	 * rare input for this corruption.
	 */
	private const TRUNCATION_RATIO = 0.5;

	/**
	 * Upper bound on simultaneous open brackets / template-interpolation frames.
	 *
	 * $stack and $frames grow with structural nesting depth, which is bounded by
	 * input bytes but not proportional to them: a small file of runaway `${`/`[`/`(`
	 * openers can amplify into hundreds of MB of frame state and trip the PHP memory
	 * limit. That fatal is NOT a \Throwable, so Minify::js()'s try/catch cannot catch
	 * it -- it would white-screen the page, the exact failure this scanner exists to
	 * prevent. Nesting this deep never occurs in legitimate output (real code nests a
	 * few levels), so reaching the cap is itself a corruption/abuse signature: the
	 * scan stops and returns "broken", routing the caller to the fail-safe original
	 * bytes.
	 *
	 * Kept to a few thousand rather than higher: the check runs after every scan
	 * step, and a step pushes at most one level, so the cap also bounds peak frame
	 * state, and each interp frame is a small array. A higher cap (e.g. 100k) lets
	 * ~150 KB of runaway `${` openers build ~22 MB of state before tripping --
	 * enough to OOM-fatal a low-memory shared host (Boost's core audience) before
	 * the cap engages, reintroducing the white-screen. At a few thousand the peak
	 * stays well under 1 MB while still sitting orders of magnitude above any real
	 * nesting.
	 */
	private const MAX_NESTING_DEPTH = 2000;

	/**
	 * JS being scanned.
	 *
	 * @var string
	 */
	private $js;

	/**
	 * Length of $js in bytes (the lexer is byte-oriented).
	 *
	 * @var int
	 */
	private $len;

	/**
	 * Current scan offset.
	 *
	 * @var int
	 */
	private $pos = 0;

	/**
	 * Current lexer state (one of the ST_* constants).
	 *
	 * @var int
	 */
	private $state = self::ST_CODE;

	/**
	 * The quote character that opened the current string (' or ").
	 *
	 * @var string
	 */
	private $string_quote = '';

	/**
	 * Stack of currently open brackets ('{', '(', '[').
	 *
	 * Coordinates with $frames: an `interp` frame snapshots count($stack) at
	 * push time, and closes_interpolation() uses that snapshot to tell a '}'
	 * that ends `${ ... }` apart from one that closes an ordinary block.
	 *
	 * @var string[]
	 */
	private $stack = array();

	/**
	 * Template / interpolation frames.
	 *
	 * Each frame is one of:
	 * - array( 'type' => 'template' ): inside the body of a `...` literal.
	 * - array( 'type' => 'interp', 'depth' => int ): inside `${ ... }`; the
	 *   'depth' field captures count($stack) at the moment the frame was
	 *   pushed, so closes_interpolation() can pair this '}' with the matching
	 *   '${' rather than a sibling code block. See scan_template() and
	 *   closes_interpolation().
	 *
	 * @var array[]
	 */
	private $frames = array();

	/**
	 * True once an extra or mismatched closing bracket has been seen.
	 *
	 * @var bool
	 */
	private $unmatched = false;

	/**
	 * True while inside a regex [...] character class.
	 *
	 * @var bool
	 */
	private $re_class = false;

	/**
	 * Last significant char seen in code state.
	 *
	 * @var string
	 */
	private $prev = '';

	/**
	 * The significant char before $prev.
	 *
	 * @var string
	 */
	private $prev_prev = '';

	/**
	 * Last identifier/keyword seen in code state.
	 *
	 * @var string
	 */
	private $prev_word = '';

	/**
	 * Whether $prev_word was a member access (immediately after a '.').
	 *
	 * @var bool
	 */
	private $prev_dot = false;

	/**
	 * Whether the given minified JS looks structurally broken/truncated.
	 *
	 * @since 4.6.0
	 *
	 * @param string      $js          Minified JS to inspect.
	 * @param string|null $original_js The pre-minification input, when available. Used
	 *                                 only for the gross-truncation backstop on inputs
	 *                                 too large to scan in full (see MAX_SCAN_BYTES).
	 *
	 * @return bool True if it looks broken; false if it looks intact.
	 */
	public static function looks_broken( $js, $original_js = null ) {
		$js     = (string) $js;
		$length = strlen( $js );

		// Empty output is handled by the caller.
		if ( 0 === $length ) {
			return false;
		}

		// Within the scan budget: run the full lexer pass.
		if ( $length <= self::MAX_SCAN_BYTES ) {
			return ( new self( $js ) )->run();
		}

		// Over budget: fall back to a cheap size-delta check against the original.
		// A minified output far smaller than its input is the truncation signature
		// even without a full scan. With no original to compare against, assume
		// intact.
		if ( null === $original_js ) {
			return false;
		}
		$original_length = strlen( (string) $original_js );

		return $original_length > 0 && $length < ( $original_length * self::TRUNCATION_RATIO );
	}

	/**
	 * Bind the scanner to one input. Instances are single-use: run() consumes
	 * the input and mutates the lexer state.
	 *
	 * @param string $js Minified JS to inspect.
	 */
	private function __construct( $js ) {
		$this->js  = $js;
		$this->len = strlen( $js );
	}

	/**
	 * Run the scan and return the verdict.
	 *
	 * Each iteration hands control to the scanner for the current state. The code
	 * scanner consumes one token; the string/template/regex/comment scanners jump
	 * directly to the next byte that matters in their state.
	 *
	 * @return bool True if the input looks broken.
	 */
	private function run() {
		while ( $this->pos < $this->len ) {
			switch ( $this->state ) {
				case self::ST_CODE:
					$broken = $this->scan_code();
					break;
				case self::ST_STRING:
					$broken = $this->scan_string();
					break;
				case self::ST_TEMPLATE:
					$broken = $this->scan_template();
					break;
				case self::ST_REGEX:
					$broken = $this->scan_regex();
					break;
				case self::ST_LINE_COMMENT:
					$broken = $this->scan_line_comment();
					break;
				case self::ST_BLOCK_COMMENT:
					$broken = $this->scan_block_comment();
					break;
				default:
					return true; // Unknown state: fail safe.
			}

			if ( $broken ) {
				return true;
			}

			// Structural nesting this deep never occurs in legitimate output; stop
			// before unbounded $stack/$frames growth can exhaust memory (an
			// uncatchable fatal). The depth itself is a corruption signature, so the
			// fail-safe verdict is "broken".
			if ( count( $this->stack ) > self::MAX_NESTING_DEPTH
				|| count( $this->frames ) > self::MAX_NESTING_DEPTH ) {
				return true;
			}
		}

		return $this->is_broken_at_eof();
	}

	/**
	 * Scan one token while in code state.
	 *
	 * @return bool True to short-circuit as broken.
	 */
	private function scan_code() {
		$c = $this->js[ $this->pos ];

		if ( '!' === $c && $this->is_invalid_bool_member_access() ) {
			return true;
		}

		if ( '/' === $c ) {
			$this->scan_slash();
			return false;
		}

		// String opener. $prev is only updated once the string closes.
		if ( "'" === $c || '"' === $c ) {
			$this->string_quote = $c;
			$this->state        = self::ST_STRING;
			++$this->pos;
			return false;
		}

		// Template opener: tracked as a frame so nested templates unwind correctly.
		if ( '`' === $c ) {
			$this->frames[] = array( 'type' => 'template' );
			$this->state    = self::ST_TEMPLATE;
			++$this->pos;
			return false;
		}

		if ( '{' === $c || '(' === $c || '[' === $c ) {
			$this->stack[] = $c;
			$this->record_prev( $c );
			++$this->pos;
			return false;
		}

		if ( '}' === $c || ')' === $c || ']' === $c ) {
			$this->close_bracket( $c );
			return false;
		}

		// Whitespace is not a significant token: skip it without touching $prev.
		if ( ctype_space( $c ) ) {
			++$this->pos;
			return false;
		}

		if ( self::is_ident_char( $c ) ) {
			$this->scan_word();
			return false;
		}

		// Any other significant char (operators like + - * . etc.).
		$this->record_prev( $c );
		++$this->pos;
		return false;
	}

	/**
	 * Whether the '!' at the current position starts `!0.name` / `!1.name`.
	 *
	 * `true`/`false` shortened to !0/!1 before a member access (e.g.
	 * true.toString() -> !0.toString()) is invalid: `0.` is a numeric literal, so
	 * the following identifier is a syntax error. This stays bracket-balanced, so
	 * it has to be checked explicitly. The exponent case is excluded: `0.e5` /
	 * `0.e+5` is a valid numeric literal, so `!0.e5` must not be flagged.
	 *
	 * @return bool
	 */
	private function is_invalid_bool_member_access() {
		$digit = $this->peek( 1 );
		if ( '0' !== $digit && '1' !== $digit ) {
			return false;
		}

		return '.' === $this->peek( 2 )
			&& self::is_ident_start( $this->peek( 3 ) )
			&& ! $this->is_exponent_at( 3 );
	}

	/**
	 * Handle a '/' in code state: comment opener, regex opener, or division.
	 *
	 * Comment openers do not update $prev; a regex opener or a division does.
	 */
	private function scan_slash() {
		$next = $this->peek();

		if ( '/' === $next || '*' === $next ) {
			$this->state = ( '/' === $next ) ? self::ST_LINE_COMMENT : self::ST_BLOCK_COMMENT;
			$this->pos  += 2;
			return;
		}

		// The decision must be made before '/' itself is recorded as $prev.
		if ( $this->regex_allowed_here() ) {
			$this->state    = self::ST_REGEX;
			$this->re_class = false;
		}
		$this->record_prev( '/' );
		++$this->pos;
	}

	/**
	 * Handle a closing bracket ('}', ')' or ']') in code state.
	 *
	 * @param string $c The closing bracket at the current position.
	 */
	private function close_bracket( $c ) {
		// A '}' may close a template interpolation (${ ... }) rather than a code
		// block: the interpolation pushed its own '{' on the stack, so this '}'
		// closes the interpolation when that '{' is the current stack top.
		if ( '}' === $c && $this->closes_interpolation() ) {
			array_pop( $this->frames );
			array_pop( $this->stack ); // The '{' from ${.
			$this->state = self::ST_TEMPLATE;
			$this->record_prev( '`' );
			++$this->pos;
			return;
		}

		if ( '}' === $c ) {
			$opener = '{';
		} elseif ( ')' === $c ) {
			$opener = '(';
		} else {
			$opener = '[';
		}

		// end() yields false on an empty stack, which never equals an opener, so
		// an extra closer and a mismatched closer both land in the else branch.
		if ( end( $this->stack ) === $opener ) {
			array_pop( $this->stack );
		} else {
			$this->unmatched = true;
		}

		$this->record_prev( $c );
		++$this->pos;
	}

	/**
	 * Consume a run of identifier characters (identifier, keyword or number) and
	 * record it as the most recent token.
	 */
	private function scan_word() {
		$start = $this->pos;
		do {
			++$this->pos;
		} while ( $this->pos < $this->len && self::is_ident_char( $this->js[ $this->pos ] ) );

		// prev_dot must be derived from the old $prev before it is overwritten.
		$this->prev_dot  = ( '.' === $this->prev );
		$this->prev_prev = $this->prev;
		$this->prev      = $this->js[ $this->pos - 1 ];
		$this->prev_word = substr( $this->js, $start, $this->pos - $start );
	}

	/**
	 * Scan forward inside a '...' or "..." string to the next escape, line
	 * terminator or closing quote, and handle it.
	 *
	 * @return bool True to short-circuit as broken (unterminated string).
	 */
	private function scan_string() {
		// Only a backslash, CR, LF or the opening quote can change the outcome;
		// everything before the first of those is plain string content.
		$this->pos += strcspn( $this->js, "\\\r\n" . $this->string_quote, $this->pos );
		if ( $this->pos >= $this->len ) {
			return false; // Ran off the end; is_broken_at_eof() reports the open string.
		}

		$c = $this->js[ $this->pos ];

		if ( '\\' === $c ) {
			// Escape sequence, including a line continuation (`\` then a line
			// terminator). A CRLF continuation is three bytes (`\` + CR + LF), so
			// consume the trailing LF too -- otherwise it would be seen as a raw
			// newline on the next step and falsely flag a valid continuation. A
			// lone CR or LF after `\` is the normal 2-byte skip.
			$is_crlf    = ( "\r" === $this->peek() && "\n" === $this->peek( 2 ) );
			$this->pos += $is_crlf ? 3 : 2;
			return false;
		}

		if ( $c !== $this->string_quote ) {
			// The only remaining stop bytes are a raw LF/CR. Inside a string
			// literal that is a syntax error (a real newline must be escaped or a
			// line continuation), so it never appears in valid minified output --
			// only in truncated/corrupted output where the closing quote was lost
			// and a later quote happened to re-open the state.
			return true; // Unterminated string literal.
		}

		$this->return_to_code( $c );
		++$this->pos;
		return false;
	}

	/**
	 * Scan forward inside a `...` template literal to the next escape, closing
	 * backtick or '$', and handle it.
	 *
	 * @return bool Always false; an unterminated template is reported at EOF.
	 */
	private function scan_template() {
		// Raw newlines are legal in templates, so only `\`, '`' and '$' matter.
		$this->pos += strcspn( $this->js, '\\`$', $this->pos );
		if ( $this->pos >= $this->len ) {
			return false;
		}

		$c = $this->js[ $this->pos ];

		if ( '\\' === $c ) {
			$this->pos += 2; // Skip the escaped byte.
			return false;
		}

		if ( '`' === $c ) {
			$top = end( $this->frames );
			if ( $top && 'template' === $top['type'] ) {
				array_pop( $this->frames );
			}
			$this->return_to_code( '`' );
			++$this->pos;
			return false;
		}

		// A '$' only matters when it opens an interpolation.
		if ( '{' === $this->peek() ) {
			$this->frames[] = array(
				'type'  => 'interp',
				'depth' => count( $this->stack ),
			);
			$this->stack[]  = '{'; // The '{' from ${.
			$this->return_to_code( '{' );
			$this->pos += 2;
			return false;
		}

		++$this->pos;
		return false;
	}

	/**
	 * Scan forward inside a /.../ regex literal to the next byte that matters in
	 * the current (in-class / out-of-class) context, and handle it.
	 *
	 * @return bool True to short-circuit as broken (unterminated regex).
	 */
	private function scan_regex() {
		// Inside [...] only the closing ']' is structural; outside, '[' opens a
		// class and '/' ends the literal. Escapes and line terminators matter in
		// both contexts.
		$stops      = $this->re_class ? "\\\r\n]" : "\\\r\n[/";
		$this->pos += strcspn( $this->js, $stops, $this->pos );
		if ( $this->pos >= $this->len ) {
			return false; // Ran off the end; is_broken_at_eof() reports the open regex.
		}

		$c = $this->js[ $this->pos ];

		if ( '\\' === $c ) {
			// Unlike a string (where `\<LF>` is a valid line continuation), a line
			// terminator cannot be escaped inside a regex literal -- a backslash
			// immediately before a raw LF/CR is the truncation signature. Check the
			// escaped byte before skipping over the pair.
			$escaped = $this->peek();
			if ( "\n" === $escaped || "\r" === $escaped ) {
				return true; // Unterminated regex literal.
			}
			$this->pos += 2;
			return false;
		}

		// A raw line terminator (LF or CR) is invalid anywhere in a regex literal,
		// including inside a [...] character class.
		if ( "\n" === $c || "\r" === $c ) {
			return true; // Unterminated regex literal.
		}

		if ( $this->re_class ) {
			// The only other in-class stop byte is ']'.
			$this->re_class = false;
			++$this->pos;
			return false;
		}

		if ( '[' === $c ) {
			$this->re_class = true;
			++$this->pos;
			return false;
		}

		// Closing '/': return to code and swallow the flag letters.
		$this->return_to_code( '/' );
		++$this->pos;
		while ( $this->pos < $this->len && ctype_alpha( $this->js[ $this->pos ] ) ) {
			++$this->pos; // Skip regex flags.
		}
		return false;
	}

	/**
	 * Skip to the end of the current // line comment.
	 *
	 * CR, LF, and CRLF all end a line comment in JS; a bare CR must close it too,
	 * otherwise code after a CR-only line ending is swallowed and broken input
	 * can read as intact. (CRLF closes on the CR; the trailing LF is then a no-op
	 * space in code state.)
	 *
	 * @return bool Always false.
	 */
	private function scan_line_comment() {
		$this->pos += strcspn( $this->js, "\r\n", $this->pos );
		if ( $this->pos < $this->len ) {
			// Stopped on a line terminator: consume it and resume code. If the
			// comment ran to EOF instead, the state is left for is_broken_at_eof().
			$this->state = self::ST_CODE;
			++$this->pos;
		}
		return false;
	}

	/**
	 * Skip to just past the end of the current block comment.
	 *
	 * @return bool Always false; an unterminated comment is reported at EOF.
	 */
	private function scan_block_comment() {
		$close = strpos( $this->js, '*/', $this->pos );
		if ( false === $close ) {
			// No terminator: consume the rest and leave the state open.
			$this->pos = $this->len;
			return false;
		}

		$this->state = self::ST_CODE;
		$this->pos   = $close + 2;
		return false;
	}

	/**
	 * Verdict once the whole input has been consumed.
	 *
	 * @return bool True if any construct is still open or a closer was unmatched.
	 */
	private function is_broken_at_eof() {
		// A line comment running to EOF is valid (e.g. a trailing
		// `//# sourceMappingURL=...` with no final newline). Every other open state
		// at EOF is a genuinely unterminated construct.
		if ( self::ST_LINE_COMMENT === $this->state ) {
			$this->state = self::ST_CODE;
		}

		return self::ST_CODE !== $this->state // Unterminated string/template/regex/block-comment.
			|| ! empty( $this->stack )        // Unbalanced brackets.
			|| ! empty( $this->frames )       // Unterminated template/interpolation.
			|| $this->unmatched;              // Saw an extra/mismatched closing bracket.
	}

	/**
	 * Whether the current '}' closes a template interpolation rather than a block.
	 *
	 * True when the innermost frame is an interpolation and the stack holds exactly
	 * one more bracket than when that frame was pushed, i.e. the '{' from its '${'
	 * is the stack top.
	 *
	 * @return bool
	 */
	private function closes_interpolation() {
		$top = end( $this->frames );
		if ( false === $top || 'interp' !== $top['type'] ) {
			return false;
		}

		return count( $this->stack ) === $top['depth'] + 1;
	}

	/**
	 * Whether a `/` at the current position begins a regex literal (vs division).
	 *
	 * JavaScript cannot decide this locally; lexers approximate it from the
	 * previous significant token: after a value (identifier, number, `)`, `]`,
	 * postfix ++/--) `/` is division; after an operator/keyword that expects an
	 * operand next, `/` begins a regex.
	 *
	 * @return bool
	 */
	private function regex_allowed_here() {
		$prev = $this->prev;

		if ( '' === $prev ) {
			return true; // Start of input.
		}

		// Postfix ++ / -- yields a value, so a following `/` is division.
		if ( ( '+' === $prev || '-' === $prev ) && $prev === $this->prev_prev ) {
			return false;
		}

		// Punctuation/operators after which a regex literal is legal. Includes the
		// arithmetic/comparison binary operators so real minified expressions like
		// `x+"|"+/\d{1,2}/.source` (regex after a binary +) are not misread.
		// $prev is always a single byte, so a substring search is a set lookup.
		if ( false !== strpos( '(,=:[!&|?{;}+-*%<>~^', $prev ) ) {
			return true;
		}

		// Closing bracket ) ], a '.', etc. -> division.
		if ( ! self::is_ident_char( $prev ) ) {
			return false;
		}

		// After a word: only the keywords below allow a regex, and only when the
		// word is not a property access (e.g. `a.return/b` is division).
		if ( $this->prev_dot ) {
			return false;
		}

		return in_array(
			$this->prev_word,
			array(
				'return',
				'throw',
				'typeof',
				'in',
				'of',
				'new',
				'do',
				'else',
				'void',
				'delete',
				'instanceof',
				'case',
				'yield',
				'await',
			),
			true
		);
	}

	/**
	 * Record the most recent significant token (without changing state).
	 *
	 * Also clears the word-tracking fields, since the token is not a word.
	 *
	 * @param string $char The token char.
	 */
	private function record_prev( $char ) {
		$this->prev_prev = $this->prev;
		$this->prev      = $char;
		$this->prev_word = '';
		$this->prev_dot  = false;
	}

	/**
	 * Return to code state, recording $char as the most recent token.
	 *
	 * @param string $char The token char (the closing delimiter / interpolation brace).
	 */
	private function return_to_code( $char ) {
		$this->record_prev( $char );
		$this->state = self::ST_CODE;
	}

	/**
	 * Look ahead from the current position.
	 *
	 * @param int $offset How far ahead to peek (default the next char).
	 *
	 * @return string The char, or '' if out of range.
	 */
	private function peek( $offset = 1 ) {
		$index = $this->pos + $offset;
		if ( $index >= $this->len ) {
			return '';
		}
		return $this->js[ $index ];
	}

	/**
	 * Whether $c can appear within a JS identifier (alphanumeric, '_' or '$').
	 *
	 * @param string $c Single character.
	 * @return bool
	 */
	private static function is_ident_char( $c ) {
		if ( '' === $c ) {
			return false;
		}
		return '_' === $c || '$' === $c || ctype_alnum( $c );
	}

	/**
	 * Whether $c can start a JS identifier (alphabetic, '_' or '$').
	 *
	 * @param string $c Single character.
	 * @return bool
	 */
	private static function is_ident_start( $c ) {
		if ( '' === $c ) {
			return false;
		}
		return '_' === $c || '$' === $c || ctype_alpha( $c );
	}

	/**
	 * Whether the chars at $offset begin a valid exponent part (the `e5` of the
	 * numeric literal `0.e5`). Lets the !0/!1 member-access check tell a real,
	 * broken member access (`!0.toString()`) apart from a valid exponent
	 * literal (`!0.e5`), which is not a member access at all.
	 *
	 * @param int $offset Offset from the current position.
	 * @return bool
	 */
	private function is_exponent_at( $offset ) {
		$marker = $this->peek( $offset );
		if ( 'e' !== $marker && 'E' !== $marker ) {
			return false;
		}

		// An optional sign may sit between the marker and the first digit.
		$digit_offset = $offset + 1;
		$sign         = $this->peek( $digit_offset );
		if ( '+' === $sign || '-' === $sign ) {
			++$digit_offset;
		}

		$digit = $this->peek( $digit_offset );
		return '' !== $digit && ctype_digit( $digit );
	}
}