Code Coverage for projects/packages/jetpack-mu-wpcom/src/features/wpcom-blocks/code/class-code-block-html-replacer.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	0.00% covered (danger)	0.00%	0 / 91	0.00% covered (danger)	0.00%	0 / 1	CRAP	0.00% covered (danger)	0.00%	0 / 1
Code_Block_HTML_Replacer	0.00% covered (danger)	0.00%	0 / 91	0.00% covered (danger)	0.00%	0 / 1	702	0.00% covered (danger)	0.00%	0 / 1
get_updated_html_with_replaced_content	0.00% covered (danger)	0.00%	0 / 91	0.00% covered (danger)	0.00%	0 / 1	702

1	<?php
2	/**
3	* HTML Replacer class for the Code Block.
4	*
5	* @package automattic/jetpack-mu-wpcom
6	*/
7
8	declare( strict_types = 1 );
9
10	namespace Automattic\Jetpack;
11
12	use WP_HTML_Processor;
13	use WP_HTML_Text_Replacement;
14
/**
 * Safely replace block HTML content with tokenized HTML.
 *
 * Extends WP_HTML_Processor so the replacement can be spliced in through the
 * processor's own bookmarks and lexical updates rather than by string surgery.
 */
class Code_Block_HTML_Replacer extends WP_HTML_Processor {
	/**
	 * Replace the code block content with the tokenized HTML.
	 *
	 * This extracts the original code text and provides the updated HTML string
	 * with the tokenized HTML inserted. The HTML structure and replacement
	 * contents are checked to ensure safety.
	 *
	 * The only accepted structure is, optionally preceded by whitespace:
	 * `<pre …><code …>text</code>` or `<pre …><code …></code>`.
	 * Anything after the closing CODE tag is discarded before normalization.
	 *
	 * @param string      $html                The HTML string containing the code block.
	 * @param array       $tokenized_code_data The tokenized code data: a list of lines, each line a
	 *                                         list of chunks shaped like
	 *                                         `array( 0 => base64 text, 1 => optional class name )`.
	 * @param string|null $language_name       The language name, if any.
	 * @return null|array{0: string, 1: string} Null on failure, or array with original code string
	 *                                          and the tokenized HTML markup.
	 */
	public static function get_updated_html_with_replaced_content( string $html, array $tokenized_code_data, ?string $language_name ): ?array {
		$processor = self::create_fragment( $html );
		// `create_fragment()` is documented as returning `static|null`.
		if ( null === $processor ) {
			return null;
		}

		// Skip leading whitespace.
		while (
			$processor->next_token()
			&& $processor->get_token_type() === '#text'
			&& $processor->text_node_classification === self::TEXT_IS_WHITESPACE
		) {
			continue;
		}

		// The serialized PRE tag has block wrapper attributes.
		// Remove them, they'll be applied in a wrapper.
		if ( $processor->get_tag() !== 'PRE' ) {
			return null;
		}
		$processor->set_bookmark( 'pre_open' );

		// The next token should be the CODE tag opener.
		if ( ! $processor->next_token() || $processor->get_tag() !== 'CODE' ) {
			return null;
		}

		if ( $language_name ) {
			// Whitespace would split the value into several class names, so fold it to underscores.
			$processor->add_class(
				'language-' .
				\strtr(
					\strtolower( $language_name ),
					array(
						' '  => '_',
						"\t" => '_',
						"\n" => '_',
						"\r" => '_',
						"\f" => '_',
					)
				)
			);
			// Return value intentionally unused. NOTE(review): presumably this flushes the
			// class change so the bookmark offsets recorded below account for it — confirm.
			$processor->get_updated_html();
		}
		$processor->set_bookmark( 'code_content_start' );

		/*
		 * The code should be 1 HTML CODE element containing the text.
		 *     <code>### text ###</code>.
		 * OR it can be an empty CODE element:
		 *     <code></code>
		 */
		if ( ! $processor->next_token() ) {
			return null;
		}
		if ( $processor->get_token_type() === '#text' ) {
			$code_string = $processor->get_modifiable_text();
			if ( ! $processor->next_token() ) {
				return null;
			}
		} else {
			$code_string = '';
		}

		// This must be the closing CODE tag of <code>…text…</code> or empty <code></code>.
		if ( $processor->get_tag() !== 'CODE' || ! $processor->is_tag_closer() ) {
			return null;
		}
		$processor->set_bookmark( 'code_content_end' );

		// Bookmarks set above are looked up under a `_`-prefixed name; bail if any is missing.
		if (
			! isset( $processor->bookmarks['_pre_open'] ) ||
			! isset( $processor->bookmarks['_code_content_start'] ) ||
			! isset( $processor->bookmarks['_code_content_end'] )
		) {
			return null;
		}

		$replacement_code_html = self::build_tokenized_code_html( $tokenized_code_data );
		if ( null === $replacement_code_html ) {
			return null;
		}

		// Clear attributes from the PRE tag, replace everything inside the CODE block, trim the end.
		$bm_pre_open = $processor->bookmarks['_pre_open'];
		$bm_start    = $processor->bookmarks['_code_content_start'];
		$bm_end      = $processor->bookmarks['_code_content_end'];
		// The replaced span runs from the end of the CODE opener to the start of the CODE closer.
		$start  = $bm_start->start + $bm_start->length;
		$length = $bm_end->start - $start;

		// Remove all attributes from the PRE tag, rewrite it as a plain <pre>.
		$processor->lexical_updates[] = new WP_HTML_Text_Replacement(
			$bm_pre_open->start,
			$bm_pre_open->length,
			'<pre>'
		);
		$processor->lexical_updates[] = new WP_HTML_Text_Replacement(
			$start,
			$length,
			$replacement_code_html
		);
		$processor->lexical_updates[] = new WP_HTML_Text_Replacement(
			$bm_end->start + $bm_end->length,
			// No need to calculate this precisely, just trim everything after this point.
			\strlen( $processor->html ),
			''
		);

		// Normalize to ensure HTML that is safer to embed with other HTML.
		// This ensures tags are correctly closed and extraneous close tags are not present.
		$html = self::normalize( $processor->get_updated_html() );
		if ( null === $html ) {
			return null;
		}

		return array( $code_string, $html );
	}

	/**
	 * Build the replacement markup for the inside of the CODE element.
	 *
	 * Every line becomes a `<div class="cm-line">`. Every chunk is base64-decoded,
	 * HTML-encoded and, when it carries a class name, wrapped in a SPAN with that class.
	 *
	 * @param array $tokenized_code_data List of lines, each a list of chunks shaped like
	 *                                   `array( 0 => base64 text, 1 => optional class name )`.
	 * @return string|null The concatenated HTML, or null when any line or chunk is malformed.
	 */
	private static function build_tokenized_code_html( array $tokenized_code_data ): ?string {
		$replacement_code_html = array();
		foreach ( $tokenized_code_data as $line ) {
			// Reject malformed lines the same way malformed chunks are rejected below.
			if ( ! \is_array( $line ) ) {
				return null;
			}

			$replacement_code_html[] = '<div class="cm-line">';
			foreach ( $line as $chunk ) {
				if (
					! \is_array( $chunk ) ||
					! isset( $chunk[0] ) ||
					! \is_string( $chunk[0] ) ||
					( isset( $chunk[1] ) && ! \is_string( $chunk[1] ) )
				) {
					return null;
				}

				// Strict mode: input containing characters outside the base64 alphabet is rejected.
				// phpcs:ignore WordPress.PHP.DiscouragedPHPFunctions.obfuscation_base64_decode
				$code = base64_decode( $chunk[0], true );
				if ( false === $code ) {
					return null;
				}

				$class_name = $chunk[1] ?? null;

				/*
				 * Do not rely on `esc_html`. It would mishandle character references
				 * that appear to be encoded already. Text like `&amp;` would be
				 * ignored, resulting in `&` rendering in the browser instead of the
				 * desired `&amp;` which must be HTML encoded as `&amp;amp;`.
				 *
				 * - ENT_NOQUOTES: Quote characters do not require encoding in HTML text nodes.
				 * - ENT_SUBSTITUTE: Replace invalid code unit sequences with a Unicode
				 *   substitution character. This is unexpected, but safe.
				 * - 'UTF-8' The base64 encoding from JavaScript is UTF-8.
				 * - true: Force HTML character references to be used for `&`, `<`, `>`
				 *   in the input string.
				 */
				$html_encoded_code = htmlspecialchars(
					$code,
					ENT_NOQUOTES | ENT_SUBSTITUTE,
					'UTF-8',
					true
				);

				if ( ! $class_name ) {
					$replacement_code_html[] = $html_encoded_code;
				} else {
					$replacement_code_html[] = \sprintf(
						'<span class="%s">%s</span>',
						esc_attr( $class_name ),
						$html_encoded_code
					);
				}
			}
			$replacement_code_html[] = '</div>';
		}

		return implode( '', $replacement_code_html );
	}
}