Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 91
0.00% covered (danger)
0.00%
0 / 1
CRAP
0.00% covered (danger)
0.00%
0 / 1
Code_Block_HTML_Replacer
0.00% covered (danger)
0.00%
0 / 91
0.00% covered (danger)
0.00%
0 / 1
702
0.00% covered (danger)
0.00%
0 / 1
 get_updated_html_with_replaced_content
0.00% covered (danger)
0.00%
0 / 91
0.00% covered (danger)
0.00%
0 / 1
702
1<?php
2/**
3 * HTML Replacer class for the Code Block.
4 *
5 * @package automattic/jetpack-mu-wpcom
6 */
7
8declare( strict_types = 1 );
9
10namespace Automattic\Jetpack;
11
12use WP_HTML_Processor;
13use WP_HTML_Text_Replacement;
14
15/**
16 * Safely replace block HTML content with tokenized HTML.
17 */
class Code_Block_HTML_Replacer extends WP_HTML_Processor {
    /**
     * Replace the code block content with the tokenized HTML.
     *
     * This extracts the original code text and provides the updated HTML string
     * with the tokenized HTML inserted. The HTML structure and replacement
     * contents are checked to ensure safety: the input must be a single
     * `<pre><code>…text…</code>` (optionally preceded by whitespace), and every
     * token must have the expected shape. Anything else yields null.
     *
     * @param string      $html The HTML string containing the code block.
     * @param array       $tokenized_code_data The tokenized code data: a list of lines, each line
     *                                         a list of chunks shaped `array{0: string, 1?: string}`
     *                                         where index 0 is the base64-encoded code text and
     *                                         index 1 is an optional class name for a wrapping SPAN.
     * @param string|null $language_name The language name, if any.
     * @return null|array{0: string, 1: string} Null on failure, or array with original code string
     *                                          and the tokenized HTML markup.
     */
    public static function get_updated_html_with_replaced_content( string $html, array $tokenized_code_data, ?string $language_name ): ?array {
        $processor = self::create_fragment( $html );

        // The fragment parser may decline to create a processor; treat that as a failure.
        if ( null === $processor ) {
            return null;
        }

        // Skip leading whitespace
        while (
            $processor->next_token()
            && $processor->get_token_type() === '#text'
            && $processor->text_node_classification === self::TEXT_IS_WHITESPACE
        ) {
            continue;
        }

        // The serialized PRE tag has block wrapper attributes.
        // Remove them, they'll be applied in a wrapper.
        if ( $processor->get_tag() !== 'PRE' ) {
            return null;
        }
        $processor->set_bookmark( 'pre_open' );

        // The next token should be the CODE tag opener.
        if ( ! $processor->next_token() || $processor->get_tag() !== 'CODE' ) {
            return null;
        }

        if ( $language_name ) {
            // Whitespace would split the value into several class names; collapse it to underscores.
            $processor->add_class(
                'language-' .
                \strtr(
                    \strtolower( $language_name ),
                    array(
                        ' '  => '_',
                        "\t" => '_',
                        "\n" => '_',
                        "\r" => '_',
                        "\f" => '_',
                    )
                )
            );
            // Apply the pending class change now; the return value is intentionally unused.
            // NOTE(review): presumably done so the bookmark below is recorded against the
            // updated markup — confirm against WP_HTML_Tag_Processor.
            $processor->get_updated_html();
        }
        $processor->set_bookmark( 'code_content_start' );

        /*
         * The code should be 1 HTML CODE element containing the text.
         * <code>### text ###</code>.
         * OR it can be an empty CODE element:
         * <code></code>
         */
        if ( ! $processor->next_token() ) {
            return null;
        }

        $code_string = '';
        if ( $processor->get_token_type() === '#text' ) {
            $code_string = $processor->get_modifiable_text();
            if ( ! $processor->next_token() ) {
                return null;
            }
        }

        // This must be the closing CODE tag of <code>…text…</code> or empty <code></code>.
        if ( $processor->get_tag() !== 'CODE' || ! $processor->is_tag_closer() ) {
            return null;
        }
        $processor->set_bookmark( 'code_content_end' );

        // Bookmarks are looked up with a leading underscore.
        // NOTE(review): the parent class appears to store bookmark names with a `_` prefix — confirm.
        if (
            ! isset( $processor->bookmarks['_pre_open'] ) ||
            ! isset( $processor->bookmarks['_code_content_start'] ) ||
            ! isset( $processor->bookmarks['_code_content_end'] )
        ) {
            return null;
        }

        $replacement_code_html = self::build_tokenized_code_html( $tokenized_code_data );
        if ( null === $replacement_code_html ) {
            return null;
        }

        // Clear attributes from the PRE tag, replace everything inside the CODE block, trim the end.
        $bm_pre_open = $processor->bookmarks['_pre_open'];
        $bm_start    = $processor->bookmarks['_code_content_start'];
        $bm_end      = $processor->bookmarks['_code_content_end'];
        $start       = $bm_start->start + $bm_start->length; // First byte after the CODE opener.
        $length      = $bm_end->start - $start;              // Everything up to the CODE closer.

        // Remove all attributes from the PRE tag, rewrite it as a plain <pre>.
        $processor->lexical_updates[] = new WP_HTML_Text_Replacement(
            $bm_pre_open->start,
            $bm_pre_open->length,
            '<pre>'
        );
        $processor->lexical_updates[] = new WP_HTML_Text_Replacement(
            $start,
            $length,
            $replacement_code_html
        );
        $processor->lexical_updates[] = new WP_HTML_Text_Replacement(
            $bm_end->start + $bm_end->length,
            // No need to calculate this precisely, just trim everything after this point.
            \strlen( $processor->html ),
            ''
        );

        // Normalize to ensure HTML that is safer to embed with other HTML.
        // This ensures tags are correctly closed and extraneous close tags are not present.
        $html = self::normalize( $processor->get_updated_html() );
        if ( null === $html ) {
            return null;
        }

        return array( $code_string, $html );
    }

    /**
     * Build the replacement markup for the tokenized code, validating every line and chunk.
     *
     * Each line becomes `<div class="cm-line">…</div>`. Each chunk is base64-decoded and
     * HTML-encoded; chunks with a class name are wrapped in a SPAN carrying that class.
     *
     * @param array $tokenized_code_data List of lines, each a list of `array{0: string, 1?: string}` chunks.
     * @return string|null The concatenated markup, or null if any line or chunk is malformed.
     */
    private static function build_tokenized_code_html( array $tokenized_code_data ): ?string {
        $parts = array();

        foreach ( $tokenized_code_data as $line ) {
            // Reject malformed lines up front instead of letting `foreach` warn on a non-array.
            if ( ! \is_array( $line ) ) {
                return null;
            }

            $parts[] = '<div class="cm-line">';
            foreach ( $line as $chunk ) {
                if (
                    ! \is_array( $chunk ) ||
                    ! isset( $chunk[0] ) ||
                    ! \is_string( $chunk[0] ) ||
                    ( isset( $chunk[1] ) && ! \is_string( $chunk[1] ) )
                ) {
                    return null;
                }

                // Strict mode: input containing characters outside the base64 alphabet is rejected.
                // phpcs:ignore WordPress.PHP.DiscouragedPHPFunctions.obfuscation_base64_decode
                $code = base64_decode( $chunk[0], true );
                if ( false === $code ) {
                    return null;
                }

                $class_name = $chunk[1] ?? null;

                /*
                 * Do not rely on `esc_html`. It would mishandle character references
                 * that appear to be encoded already. HTML like `&amp;` would be
                 * ignored, resulting in `&` rendering in the browser instead of the
                 * desired `&amp;` which must be HTML encoded as `&amp;amp;`.
                 *
                 * - ENT_NOQUOTES: Quote characters do not require encoding in HTML text nodes.
                 * - ENT_SUBSTITUTE: Replace invalid code unit sequences with a Unicode
                 *                   substitution character. This is unexpected, but safe.
                 * - 'UTF-8' The base64 encoding from JavaScript is UTF-8.
                 * - true: Force HTML character references to be used for `&`, `<`, `>`
                 *         in the input string.
                 */
                $html_encoded_code = htmlspecialchars(
                    $code,
                    ENT_NOQUOTES | ENT_SUBSTITUTE,
                    'UTF-8',
                    true
                );

                if ( ! $class_name ) {
                    $parts[] = $html_encoded_code;
                } else {
                    $parts[] = \sprintf(
                        '<span class="%s">%s</span>',
                        esc_attr( $class_name ),
                        $html_encoded_code
                    );
                }
            }
            $parts[] = '</div>';
        }

        return implode( '', $parts );
    }
}