Code Coverage for projects/plugins/jetpack/_inc/lib/class.media-extractor.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	85.31% covered (warning)	85.31%	209 / 245	37.50% covered (danger)	37.50%	3 / 8	CRAP	0.00% covered (danger)	0.00%	0 / 1
Jetpack_Media_Meta_Extractor	85.31% covered (warning)	85.31%	209 / 245	37.50% covered (danger)	37.50%	3 / 8	131.73	0.00% covered (danger)	0.00%	0 / 1
extract	78.95% covered (warning)	78.95%	15 / 19	0.00% covered (danger)	0.00%	0 / 1	7.46
extract_from_content	85.93% covered (warning)	85.93%	116 / 135	0.00% covered (danger)	0.00%	0 / 1	68.70
get_image_fields	70.97% covered (warning)	70.97%	22 / 31	0.00% covered (danger)	0.00%	0 / 1	9.57
reduce_extracted_images	86.67% covered (warning)	86.67%	13 / 15	0.00% covered (danger)	0.00%	0 / 1	6.09
extract_images_from_content	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
build_image_struct	100.00% covered (success)	100.00%	11 / 11	100.00% covered (success)	100.00%	1 / 1	4
get_images_from_html	92.86% covered (success)	92.86%	26 / 28	0.00% covered (danger)	0.00%	0 / 1	14.07
get_stripped_content	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1

1	<?php // phpcs:ignore WordPress.Files.FileName.InvalidClassFileName
2	/**
3	* Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
4	* in or attached to the post/page.
5	*
6	* @package automattic/jetpack
7	*/
8
9	use Automattic\Jetpack\Post_Media\Images;
10
11	/**
12	* Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
13	* in or attached to the post/page.
14	*
15	* @todo Additionally, have some filters on number of items in each field
16	*/
17	class Jetpack_Media_Meta_Extractor {
18
// Bit flags selecting what to extract; combine several with a bitwise OR.
const ALL        = 255;    // Every flag at once (all eight low bits set).
const LINKS      = 1 << 0; // 1
const MENTIONS   = 1 << 1; // 2
const IMAGES     = 1 << 2; // 4
const SHORTCODES = 1 << 3; // 8 - IDs are only collected for the keeper shortcodes below.
const EMBEDS     = 1 << 4; // 16
const HASHTAGS   = 1 << 5; // 32
27
/**
 * Shortcodes to keep.
 *
 * The presence of every shortcode found in the content is recorded. For the ones listed here,
 * extract_from_content() additionally tries to record an ID/URL of the embedded object.
 * The ID is looked up through a function jetpack_shortcode_get_{shortcode}_id( $atts ) or a static
 * method {Shortcode}Shortcode::get_{shortcode}_id( $atts ) when one exists; 'video' and 'audio'
 * fall back to reading the shortcode attributes directly.
 *
 * @var string[]
 */
private static $keeper_shortcodes = array(
	'audio',
	'youtube',
	'vimeo',
	'hulu',
	'ted',
	'video',
	'wpvideo',
	'videopress',
);
46
/**
 * Gets the specified media and meta info from the given post.
 * NOTE: If you have the post's HTML content already and don't need image data, use extract_from_content() instead.
 *
 * The post title and content are joined and capped at 100000 bytes before anything other than
 * images is extracted; images are extracted from the full post object.
 *
 * @param int     $blog_id          The ID of the blog.
 * @param int     $post_id          The ID of the post.
 * @param int     $what_to_extract  A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
 * @param boolean $extract_alt_text Should alt_text be extracted, defaults to false.
 *
 * @return array A structure containing metadata about the embedded things, or an empty array if the post
 *               could not be loaded or nothing was found.
 */
public static function extract( $blog_id, $post_id, $what_to_extract = self::ALL, $extract_alt_text = false ) {

	// On multisite, look the post up in the context of its own blog.
	if ( function_exists( 'switch_to_blog' ) ) {
		switch_to_blog( $blog_id );
	}

	$post = get_post( $post_id );
	if ( ! $post instanceof WP_Post ) {
		if ( function_exists( 'restore_current_blog' ) ) {
			restore_current_blog();
		}
		return array();
	}

	$content   = $post->post_title . "\n\n" . $post->post_content;
	$max_bytes = 100000; // about 20k English words.

	// Prevent running extraction on really huge amounts of content.
	if ( strlen( $content ) > $max_bytes ) {
		// mb_strcut() cuts by bytes but never in the middle of a multi-byte character. A plain substr()
		// can leave a partial UTF-8 sequence at the end, and PCRE patterns using the /u modifier
		// (as used by extract_from_content()) match nothing on an invalid UTF-8 subject.
		$content = function_exists( 'mb_strcut' )
			? mb_strcut( $content, 0, $max_bytes, 'UTF-8' )
			: substr( $content, 0, $max_bytes );
	}

	$extracted = array();

	// Get images first, we need the full post for that.
	if ( self::IMAGES & $what_to_extract ) {
		$extracted = self::get_image_fields( $post, array(), $extract_alt_text );

		// Clear the IMAGES bit so extract_from_content() below does not look for images again.
		$what_to_extract &= ~self::IMAGES;
	}

	if ( function_exists( 'restore_current_blog' ) ) {
		restore_current_blog();
	}

	// All of the other things besides images can be extracted from just the content.
	return self::extract_from_content( $content, $what_to_extract, $extracted );
}
99
/**
 * Gets the specified meta info from the given post content.
 *
 * NOTE: If you want IMAGES, prefer extract( $blog_id, $post_id, ... ), which also inspects the post's
 * featured image, slideshows and galleries. When IMAGES is requested here, images are only looked
 * up in the tag-stripped content.
 *
 * Shortcodes are always located and removed from the working copy of the content (so that URLs inside
 * them are not reported as links or embeds); they are only recorded when SHORTCODES is requested.
 *
 * @param string $content           The HTML post_content of a post.
 * @param int    $what_to_extract   A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
 * @param array  $already_extracted Previously extracted things, e.g. images from extract(), which can be used for x-referencing here.
 *
 * @return array A structure containing metadata about the embedded things, or $already_extracted unchanged if nothing was found.
 */
public static function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
	$stripped_content = self::get_stripped_content( $content );

	// Maybe start with some previously extracted things (e.g. images from extract()).
	$extracted = $already_extracted;

	// Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.

	if ( self::IMAGES & $what_to_extract ) {
		// NOTE(review): this searches the tag-stripped text, so <img> tags from the original markup
		// are presumably no longer present at this point - confirm this is intended.
		$images    = self::extract_images_from_content( $stripped_content, array() );
		$extracted = array_merge( $extracted, $images );
	}

	// ----------------------------------- MENTIONS ------------------------------

	if ( ( self::MENTIONS & $what_to_extract ) && preg_match_all( '/(^|\s)@(\w+)/u', $stripped_content, $matches ) ) {
		// Lowercase before de-duplicating so "@Bob" and "@bob" count once.
		// array_values() re-indexes because array_unique() retains the keys.
		$mentions = array_values( array_unique( array_map( 'strtolower', $matches[2] ) ) );

		$extracted['mention']        = array( 'name' => $mentions );
		$extracted['has']['mention'] = count( $mentions );
	}

	// ----------------------------------- HASHTAGS ------------------------------
	/**
	 * Some hosts may not compile with --enable-unicode-properties and kick a warning:
	 * Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
	 * Therefore, we only run this code block on wpcom, not in Jetpack.
	 */
	if ( ( defined( 'IS_WPCOM' ) && IS_WPCOM ) && ( self::HASHTAGS & $what_to_extract ) ) {
		// This regex does not exactly match Twitter's
		// if there are problems/complaints we should implement this:
		// https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java .
		// A hashtag is any run of word characters that contains at least one letter.
		if ( preg_match_all( '/(?:^|\s)#(\w*\p{L}+\w*)/u', $stripped_content, $matches ) ) {
			// Same normalisation as for mentions: lowercase, then de-duplicate and re-index.
			$hashtags = array_values( array_unique( array_map( 'strtolower', $matches[1] ) ) );

			$extracted['hashtag']        = array( 'name' => $hashtags );
			$extracted['has']['hashtag'] = count( $hashtags );
		}
	}

	// ----------------------------------- SHORTCODES ------------------------------

	// Always look for shortcodes.
	// If we don't want them, we'll just remove them, so we don't grab them as links below.
	$shortcode_pattern = '/' . get_shortcode_regex() . '/s';
	if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {

		if ( self::SHORTCODES & $what_to_extract ) {

			$shortcode_total_count = 0;
			$shortcode_type_counts = array();
			$shortcode_types       = array();
			$shortcode_details     = array();

			foreach ( $matches[2] as $key => $shortcode ) {
				// Elasticsearch (and probably other things) doesn't deal well with some chars as key names.
				$shortcode_name = preg_replace( '/[.,*"\'\/\\\\#+ ]/', '_', $shortcode );

				$attr = shortcode_parse_atts( $matches[3][ $key ] );

				++$shortcode_total_count;
				if ( ! isset( $shortcode_type_counts[ $shortcode_name ] ) ) {
					$shortcode_type_counts[ $shortcode_name ] = 0;
				}
				++$shortcode_type_counts[ $shortcode_name ];

				// Store (uniquely) presence of all shortcode regardless of whether it's a keeper (for those, get ID below)
				// @todo Store number of occurrences?
				if ( ! in_array( $shortcode_name, $shortcode_types, true ) ) {
					$shortcode_types[] = $shortcode_name;
				}

				// Only keeper shortcodes also get the id/url of the object (e.g. youtube video, TED talk, etc.).
				if ( ! in_array( $shortcode, self::$keeper_shortcodes, true ) ) {
					continue;
				}

				// Start clean so no ID leaks over from the previous shortcode.
				$id = null;

				// We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id().
				// If the shortcode is a class, we'll call XyzShortcode::get_xyz_id().
				$shortcode_get_id_func   = "jetpack_shortcode_get_{$shortcode}_id";
				$shortcode_class_name    = ucfirst( $shortcode ) . 'Shortcode';
				$shortcode_get_id_method = "get_{$shortcode}_id";

				if ( function_exists( $shortcode_get_id_func ) ) {
					$id = call_user_func( $shortcode_get_id_func, $attr );
				} elseif ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
					$id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
				} elseif ( 'video' === $shortcode ) {
					$id = $attr['src'] ?? $attr['url'] ?? $attr['mp4'] ?? $attr['m4v'] ?? $attr['webm'] ?? $attr['ogv'] ?? $attr['wmv'] ?? $attr['flv'] ?? null;
				} elseif ( 'audio' === $shortcode ) {
					// The (array) cast keeps implode() safe should the parsed attributes not be an array.
					preg_match( '#(https?://(?:[^\s"|\']+)\.(?:mp3|ogg|flac|m4a|wav))([ "\'|]|$)#', implode( ' ', (array) $attr ), $audio_matches );
					$id = $audio_matches[1] ?? null;
				}

				if ( ! empty( $id )
					&& ( ! isset( $shortcode_details[ $shortcode_name ] ) || ! in_array( $id, $shortcode_details[ $shortcode_name ], true ) ) ) {
					$shortcode_details[ $shortcode_name ][] = $id;
				}
			}

			if ( $shortcode_total_count > 0 ) {
				// Add the shortcode info to the $extracted array.
				$extracted['has']['shortcode'] = $shortcode_total_count;
				$extracted['shortcode']        = array();
				foreach ( $shortcode_type_counts as $type => $count ) {
					$extracted['shortcode'][ $type ] = array( 'count' => $count );
				}
				if ( ! empty( $shortcode_types ) ) {
					$extracted['shortcode_types'] = $shortcode_types;
				}
				foreach ( $shortcode_details as $type => $id ) {
					$extracted['shortcode'][ $type ]['id'] = $id;
				}
			}
		}

		// Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
		// preg_replace() returns null on failure; keep the content as it was in that case.
		$content_without_shortcodes = preg_replace( $shortcode_pattern, ' ', $content );
		if ( null !== $content_without_shortcodes ) {
			$content = $content_without_shortcodes;
		}
	}

	// ----------------------------------- LINKS ------------------------------

	if ( self::LINKS & $what_to_extract ) {

		// To hold the extracted stuff we find.
		$links = array();

		// @todo Get the text inside the links?

		// Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images.
		// (we treat embed links as just another link).
		if ( preg_match_all( '#(?:^|\s|"|\')(https?://([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))))#', $content, $matches ) ) {

			foreach ( $matches[1] as $link_raw ) {
				$url = wp_parse_url( $link_raw );

				// Data URI links (also skips anything wp_parse_url() could not parse).
				if ( ! isset( $url['scheme'] ) || 'data' === $url['scheme'] ) {
					continue;
				}

				// Reject invalid URLs.
				if ( ! isset( $url['host'] ) ) {
					continue;
				}

				// Remove large (and likely invalid) links.
				if ( 4096 < strlen( $link_raw ) ) {
					continue;
				}

				// Build a simple form of the URL so we can compare it to ones we found in IMAGES or SHORTCODES and exclude those.
				// NOTE(review): build_image_struct() stores images as a list of array( 'url' => ... ) structs, so
				// $extracted['image']['url'] looks like it is only set for a differently shaped $already_extracted - confirm.
				$simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
				if ( isset( $extracted['image']['url'] ) && in_array( $simple_url, (array) $extracted['image']['url'], true ) ) {
					continue;
				}

				// Limit the split to 2 so a second "://" later in the URL (e.g. in a query arg) is kept intact.
				$link_all_but_proto = explode( '://', $link_raw, 2 )[1];

				$link = array(
					'url'           => $link_all_but_proto,
					// Host labels in reverse order, e.g. "www.example.com" becomes "com.example.www".
					'host_reversed' => implode( '.', array_reverse( explode( '.', $url['host'] ) ) ),
					'host'          => $url['host'],
				);
				if ( ! in_array( $link, $links, true ) ) {
					$links[] = $link;
				}
			}
		}

		$link_count = count( $links );
		if ( $link_count ) {
			$extracted['link']        = $links;
			$extracted['has']['link'] = $link_count;
		}
	}

	// ----------------------------------- EMBEDS ------------------------------

	// Embeds are just individual links on their own line.
	if ( self::EMBEDS & $what_to_extract ) {

		if ( ! function_exists( '_wp_oembed_get_object' ) ) {
			include ABSPATH . WPINC . '/class-oembed.php';
		}

		// get an oembed object.
		$oembed = _wp_oembed_get_object();

		// Grab any links on their own lines that may be embeds (surrounding whitespace is optional).
		if ( preg_match_all( '|^\s*(https?://[^\s"]+)\s*$|im', $content, $matches ) ) {

			// To hold the extracted stuff we find.
			$embeds = array();

			foreach ( $matches[1] as $link_raw ) {
				// Check whether this "link" is really an embed.
				foreach ( $oembed->providers as $matchmask => $data ) {
					// Guard against malformed oEmbed providers.
					if ( ! isset( $data[0] ) ) {
						continue;
					}
					$regex = $data[1] ?? false;

					// Turn the asterisk-type provider URLs into regex.
					if ( ! $regex ) {
						$matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
						$matchmask = preg_replace( '|^#http\\\://|', '#https?\://', $matchmask );
					}

					if ( preg_match( $matchmask, $link_raw ) ) {
						// Store the URL without its protocol. @todo Check unique before adding.
						$embeds[] = explode( '://', $link_raw, 2 )[1];

						// @todo Try to get ID's for the ones we care about (shortcode_keepers)
						break;
					}
				}
			}

			if ( ! empty( $embeds ) ) {
				$extracted['has']['embed'] = count( $embeds );
				$extracted['embed']        = array( 'url' => $embeds );
			}
		}
	}

	return $extracted;
}
388
/**
 * Collects the images of a post from its featured image, slideshows, galleries and inline HTML.
 *
 * The minimum width/height in $args is passed to the featured-image and slideshow lookups only;
 * gallery and inline HTML images are taken as they come.
 *
 * @uses Images
 *
 * @param WP_Post $post             A post object.
 * @param array   $args             Optional args: 'width' and 'height' (both default to 200).
 * @param boolean $extract_alt_text Should alt_text be extracted, defaults to false.
 *
 * @return array The structure produced by build_image_struct(), or an empty array when $post is not
 *               a WP_Post or no image was found.
 */
private static function get_image_fields( $post, $args = array(), $extract_alt_text = false ) {
	if ( ! ( $post instanceof WP_Post ) ) {
		return array();
	}

	$args = wp_parse_args(
		$args,
		array(
			'width'  => 200, // Required minimum width (if possible to determine).
			'height' => 200, // Required minimum height (if possible to determine).
		)
	);

	// Looked up in this order; the results are concatenated in the same order below.
	$found_by_source = array(
		'featured'  => Images::from_thumbnail( $post->ID, $args['width'], $args['height'] ),
		'slideshow' => Images::from_slideshow( $post->ID, $args['width'], $args['height'] ),
		'gallery'   => Images::from_gallery( $post->ID ),
	);

	$collected = array();
	foreach ( $found_by_source as $found ) {
		if ( empty( $found ) ) {
			continue;
		}

		// With alt text we keep reduced structs, otherwise just the plain source URLs.
		$entries   = $extract_alt_text
			? self::reduce_extracted_images( $found )
			: wp_list_pluck( $found, 'src' );
		$collected = array_merge( $collected, $entries );
	}

	// @todo This count isn't correct, it only ever records 0 or 1.
	$flags = array(
		'gallery' => empty( $found_by_source['gallery'] ) ? 0 : 1,
	);

	// @todo Can we check width/height of these efficiently? Could maybe use query args at least, before we strip them out
	$collected = self::get_images_from_html( $post->post_content, $collected, $extract_alt_text );

	return self::build_image_struct( $collected, $flags );
}
453
/**
 * Reduces extracted image arrays to the fields kept for indexing.
 *
 * Entries without a 'src' are dropped. An image without alt text is reduced to its bare source URL
 * (a string); an image with alt text becomes an array with 'url', optionally 'src_width' and
 * 'src_height', and 'alt_text'.
 *
 * @param array $images Extracted image array.
 *
 * @return array Reduced image array (a mix of URL strings and image arrays).
 */
protected static function reduce_extracted_images( $images ) {
	$reduced = array();

	foreach ( $images as $candidate ) {
		// Nothing to keep when there is no source URL.
		if ( empty( $candidate['src'] ) ) {
			continue;
		}

		// Without alt text only the URL itself is kept, dimensions included or not.
		if ( empty( $candidate['alt_text'] ) ) {
			$reduced[] = $candidate['src'];
			continue;
		}

		$entry = array( 'url' => $candidate['src'] );

		// Dimensions travel as a pair: if either one is known, both keys are written.
		$has_dimensions = ! empty( $candidate['src_height'] ) || ! empty( $candidate['src_width'] );
		if ( $has_dimensions ) {
			$entry['src_width']  = $candidate['src_width'] ?? '';
			$entry['src_height'] = $candidate['src_height'] ?? '';
		}

		$entry['alt_text'] = $candidate['alt_text'];
		$reduced[]         = $entry;
	}

	return $reduced;
}
484
/**
 * Helper that pulls the images out of HTML and returns them in the set structure.
 *
 * @param string  $content          HTML content.
 * @param array   $image_list       Array of already found images.
 * @param boolean $extract_alt_text Whether or not to extract the alt text.
 *
 * @return array|array[] The structure produced by build_image_struct(); empty when no image is known.
 */
public static function extract_images_from_content( $content, $image_list, $extract_alt_text = false ) {
	return self::build_image_struct(
		self::get_images_from_html( $content, $image_list, $extract_alt_text ),
		array()
	);
}
498
/**
 * Produces the set structure for extracted media items.
 *
 * Duplicate entries are removed, bare URL strings are wrapped as array( 'url' => ... ), and the
 * number of resulting images is stored under 'image' next to the given booleans in 'has'.
 *
 * @param array $image_list     Array of images (URL strings and/or image arrays).
 * @param array $image_booleans Image booleans.
 *
 * @return array|array[] array( 'image' => ..., 'has' => ... ), or an empty array when $image_list is empty.
 */
public static function build_image_struct( $image_list, $image_booleans ) {
	if ( empty( $image_list ) ) {
		return array();
	}

	// array_unique() keeps the original keys and array_map() preserves them, so re-index afterwards.
	$structs = array_values(
		array_map(
			static function ( $entry ) {
				return is_string( $entry ) ? array( 'url' => $entry ) : $entry;
			},
			array_unique( $image_list, SORT_REGULAR )
		)
	);

	$image_booleans['image'] = count( $structs );

	return array(
		'image' => $structs,
		'has'   => $image_booleans,
	);
}
525
/**
 * Extracts images from html.
 *
 * Each image URL is reduced to scheme, host and path (query string removed). URLs longer than 4096
 * bytes and URLs already present in the list are skipped.
 *
 * @param string  $html                     Some markup, possibly containing image tags.
 * @param array   $images_already_extracted Images found earlier, used for de-duplication. Entries may be
 *                                          URL strings (without query strings) or arrays with a 'url' key.
 * @param boolean $extract_alt_text         Should alt_text be extracted, defaults to false.
 *
 * @return array $images_already_extracted followed by the new images: URL strings, or arrays with 'url' and
 *               optionally 'alt_text', 'src_width' and 'src_height' when $extract_alt_text is true.
 */
public static function get_images_from_html( $html, $images_already_extracted, $extract_alt_text = false ) {
	$image_list = $images_already_extracted;
	$from_html  = Images::from_html( $html );

	// Early return if there is no image in the html.
	if ( empty( $from_html ) ) {
		return $image_list;
	}

	// Index the URLs we already have, whether stored as plain strings or as image arrays,
	// so that an image is never listed twice under the same URL.
	$seen_urls = array();
	foreach ( $image_list as $known ) {
		if ( is_string( $known ) ) {
			$seen_urls[ $known ] = true;
		} elseif ( is_array( $known ) && isset( $known['url'] ) && is_string( $known['url'] ) ) {
			$seen_urls[ $known['url'] ] = true;
		}
	}

	foreach ( $from_html as $extracted_image ) {
		// Skip entries without a usable source URL.
		if ( empty( $extracted_image['src'] ) ) {
			continue;
		}

		$image_url = $extracted_image['src'];
		$src       = wp_parse_url( $image_url );

		if ( $src && isset( $src['scheme'] ) && isset( $src['host'] ) && isset( $src['path'] ) ) {
			// Rebuild the URL without the query string.
			$queryless = $src['scheme'] . '://' . $src['host'] . $src['path'];
		} else {
			// If wp_parse_url() didn't work, strip off the query string the old fashioned way,
			// or keep the URL as it is when there is no query string to strip.
			$query_start = strpos( $image_url, '?' );
			$queryless   = $query_start ? substr( $image_url, 0, $query_start ) : $image_url;
		}

		// Discard URLs that are longer than 4KB, these are likely data URIs or malformed HTML.
		if ( 4096 < strlen( $queryless ) ) {
			continue;
		}

		if ( isset( $seen_urls[ $queryless ] ) ) {
			continue;
		}
		$seen_urls[ $queryless ] = true;

		if ( ! $extract_alt_text ) {
			$image_list[] = $queryless;
			continue;
		}

		$image_to_add = array(
			'url' => $queryless,
		);
		if ( ! empty( $extracted_image['alt_text'] ) ) {
			$image_to_add['alt_text'] = $extracted_image['alt_text'];
		}
		// Dimensions travel as a pair; a missing one is stored as '' (as reduce_extracted_images() does).
		if ( ! empty( $extracted_image['src_width'] ) || ! empty( $extracted_image['src_height'] ) ) {
			$image_to_add['src_width']  = $extracted_image['src_width'] ?? '';
			$image_to_add['src_height'] = $extracted_image['src_height'] ?? '';
		}
		$image_list[] = $image_to_add;
	}

	return $image_list;
}
584
/**
 * Strips content of all tags and shortcodes, and decodes HTML entities.
 *
 * The steps run in this order: tags are removed, entities are decoded, then shortcodes are removed.
 *
 * @param string $content Original content.
 *
 * @return string Cleaned content.
 */
private static function get_stripped_content( $content ) {
	$without_tags = wp_strip_all_tags( $content );
	$decoded      = html_entity_decode( $without_tags, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401 );

	// Completely strip shortcodes and any content they enclose.
	return strip_shortcodes( $decoded );
}
599	}