summaryrefslogtreecommitdiff
blob: 566eb7009c300003f5e7dd4f21fca819d56846f2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
<?php // phpcs:ignore WordPress.Files.FileName.InvalidClassFileName
/**
 * The companion file to shortcodes.php
 *
 * This file contains the code that converts HTML embeds into shortcodes
 * for when the user copy/pastes in HTML.
 *
 * @package automattic/jetpack
 */

// Convert recognized HTML embeds into shortcodes while the content passes through the pre_kses filter.
add_filter(
	'pre_kses',
	array( 'Filter_Embedded_HTML_Objects', 'filter' ),
	11
);

// Registered at a later priority (100) so it runs after the conversion above:
// embeds that no registered filter recognized are turned into plain links.
// See WPCom_Embed_Stats::init().
add_filter(
	'pre_kses',
	array( 'Filter_Embedded_HTML_Objects', 'maybe_create_links' ),
	100
);

/**
 * Helper class for identifying and parsing known HTML embeds (iframe, object, embed, etc. elements), then converting them to shortcodes.
 * For unknown HTML embeds, the class still tries to convert them to plain links so that at least something is preserved instead of having the entire element stripped by KSES.
 *
 * @since 4.5.0
 */
class Filter_Embedded_HTML_Objects {
	/**
	 * Array of patterns to search for via strpos().
	 * Keys are patterns, values are callback functions that implement the HTML -> shortcode replacement.
	 * Patterns are matched against URLs (src or movie HTML attributes).
	 *
	 * @var array
	 */
	public static $strpos_filters = array();

	/**
	 * Array of patterns to search for via preg_match().
	 * Keys are patterns, values are callback functions that implement the HTML -> shortcode replacement.
	 * Patterns are matched against URLs (src or movie HTML attributes).
	 *
	 * @var array
	 */
	public static $regexp_filters = array();

	/**
	 * Name of the HTML element type being processed by filter()
	 * (one of the keys of the element regexp table), or false before the first run.
	 *
	 * @var string|false
	 */
	public static $current_element = false;

	/**
	 * Array of patterns to search for via strpos().
	 * Keys are patterns, values are callback functions that implement the HTML -> shortcode replacement.
	 * Patterns are matched against full HTML elements.
	 *
	 * @var array
	 */
	public static $html_strpos_filters = array();

	/**
	 * Array of patterns to search for via preg_match().
	 * Keys are patterns, values are callback functions that implement the HTML -> shortcode replacement.
	 * Patterns are matched against full HTML elements.
	 *
	 * @var array
	 */
	public static $html_regexp_filters = array();

	/**
	 * Failed embeds (stripped).
	 * Each entry is an array with a 'match' key (the original HTML) and a 'src' key (the escaped source URL).
	 *
	 * @var array
	 */
	public static $failed_embeds = array();

	/**
	 * Store tokens found in Syntax Highlighter.
	 * Keys are the placeholder tokens, values are the original shortcode text they stand for.
	 *
	 * @since 4.5.0
	 *
	 * @var array
	 */
	private static $sh_unfiltered_content_tokens = array();

	/**
	 * Capture tokens found in Syntax Highlighter and collect them in self::$sh_unfiltered_content_tokens.
	 *
	 * @since 4.5.0
	 *
	 * @param array $match Array of Syntax Highlighter matches.
	 *
	 * @return string Placeholder token that replaces the matched shortcode block.
	 */
	public static function sh_regexp_callback( $match ) {
		$token = sprintf(
			'[prekses-filter-token-%1$d-%2$s-%1$d]',
			wp_rand(),
			md5( $match[0] )
		);

		self::$sh_unfiltered_content_tokens[ $token ] = $match[0];

		return $token;
	}

	/**
	 * Look for HTML elements that match the registered patterns.
	 * Replace them with the HTML generated by the registered replacement callbacks.
	 *
	 * @param string $html Post content.
	 *
	 * @return string Filtered content. Empty or non-string input is returned untouched.
	 */
	public static function filter( $html ) {
		if ( ! $html || ! is_string( $html ) ) {
			return $html;
		}

		$regexps = array(
			'object' => '%<object[^>]*+>(?>[^<]*+(?><(?!/object>)[^<]*+)*)</object>%i',
			'embed'  => '%<embed[^>]*+>(?:\s*</embed>)?%i',
			'iframe' => '%<iframe[^>]*+>(?>[^<]*+(?><(?!/iframe>)[^<]*+)*)</iframe>%i',
			'div'    => '%<div[^>]*+>(?>[^<]*+(?><(?!/div>)[^<]*+)*+)(?:</div>)+%i',
			'script' => '%<script[^>]*+>(?>[^<]*+(?><(?!/script>)[^<]*+)*)</script>%i',
		);

		/*
		 * Work on a local copy of the token map: the static property is reset at the start
		 * of every call, so a nested call to this method would otherwise wipe our tokens.
		 */
		$unfiltered_content_tokens          = array();
		self::$sh_unfiltered_content_tokens = array();

		// Check here to make sure that SyntaxHighlighter is still used. (Just a little future proofing).
		if ( class_exists( 'SyntaxHighlighter' ) ) {
			/*
			 * Replace any "code" shortcode blocks with a token that we'll later replace with its original text.
			 * This will keep the contents of the shortcode from being filtered.
			 */
			global $SyntaxHighlighter; // phpcs:ignore WordPress.NamingConventions.ValidVariableName.VariableNotSnakeCase

			// Check to see if the $SyntaxHighlighter object has been created and exposes a usable shortcode list.
			if (
				isset( $SyntaxHighlighter ) // phpcs:ignore WordPress.NamingConventions.ValidVariableName.VariableNotSnakeCase
				&& is_object( $SyntaxHighlighter ) // phpcs:ignore WordPress.NamingConventions.ValidVariableName.VariableNotSnakeCase
				&& is_array( $SyntaxHighlighter->shortcodes ) // phpcs:ignore WordPress.NamingConventions.ValidVariableName.VariableNotSnakeCase
				&& ! empty( $SyntaxHighlighter->shortcodes ) // phpcs:ignore WordPress.NamingConventions.ValidVariableName.VariableNotSnakeCase
			) {
				// Quote with the delimiter used below so a "/" in a shortcode name cannot break the pattern.
				$quoted_shortcodes = array();
				foreach ( $SyntaxHighlighter->shortcodes as $shortcode ) { // phpcs:ignore WordPress.NamingConventions.ValidVariableName.VariableNotSnakeCase
					$quoted_shortcodes[] = preg_quote( $shortcode, '/' );
				}
				$shortcode_regex = implode( '|', $quoted_shortcodes );

				$tokenized_html = preg_replace_callback(
					'/\[(' . $shortcode_regex . ')(\s[^\]]*)?\][\s\S]*?\[\/\1\]/m',
					array( __CLASS__, 'sh_regexp_callback' ),
					$html
				);

				// preg_replace_callback() returns null on a PCRE error; never let that wipe the content.
				if ( is_string( $tokenized_html ) ) {
					$html                      = $tokenized_html;
					$unfiltered_content_tokens = self::$sh_unfiltered_content_tokens;
				}
			}
		}

		foreach ( $regexps as $element => $regexp ) {
			self::$current_element = $element;

			if ( false !== stripos( $html, "<$element" ) ) {
				$new_html = preg_replace_callback( $regexp, array( __CLASS__, 'dispatch' ), $html );
				// Only a null result signals failure; an empty string is a valid replacement outcome.
				if ( is_string( $new_html ) ) {
					$html = $new_html;
				}
			}

			if ( false !== stripos( $html, "&lt;$element" ) ) {
				$regexp_entities = self::regexp_entities( $regexp );
				$new_html        = preg_replace_callback( $regexp_entities, array( __CLASS__, 'dispatch_entities' ), $html );
				if ( is_string( $new_html ) ) {
					$html = $new_html;
				}
			}
		}

		if ( count( $unfiltered_content_tokens ) > 0 ) {
			// Replace any tokens generated earlier with their original unfiltered text.
			$html = str_replace( array_keys( $unfiltered_content_tokens ), $unfiltered_content_tokens, $html );
		}

		return $html;
	}

	/**
	 * Replace HTML entities in current HTML element regexp.
	 * This is useful when the content is HTML encoded by TinyMCE.
	 *
	 * The regexp is first HTML-encoded ("<" becomes "&lt;", ">" becomes "&gt;"), the "?>" of
	 * atomic groups is restored, and every encoded negated character class such as "[^&gt;]*+"
	 * is rewritten into a group that consumes text up to, but not including, that entity.
	 *
	 * @param string $regexp Selected regexp.
	 *
	 * @return string Regexp that matches the entity-encoded form of the element.
	 */
	public static function regexp_entities( $regexp ) {
		return preg_replace(
			'/\[\^&([^\]]+)\]\*\+/',
			'(?>[^&]*+(?>&(?!\1)[^&])*+)*+',
			str_replace( '?&gt;', '?' . '>', htmlspecialchars( $regexp, ENT_NOQUOTES ) )
		);
	}

	/**
	 * Register a filter to convert a matching HTML element to a shortcode.
	 *
	 * We can match the provided pattern against the source URL of the HTML element
	 * (generally the value of the src attribute of the HTML element), or against the full HTML element.
	 *
	 * The callback is passed an array containing the raw HTML of the element as well as pre-parsed attribute name/values.
	 *
	 * @param string   $match          Pattern to search for: either a regular expression to use with preg_match() or a search string to use with strpos().
	 * @param callable $callback       Function used to convert embed into shortcode.
	 * @param bool     $is_regexp      Is $match a regular expression? If true, match using preg_match(). If not, match using strpos(). Default false.
	 * @param bool     $is_html_filter Match the pattern against the full HTML (true) or just the source URL (false)? Default false.
	 */
	public static function register( $match, $callback, $is_regexp = false, $is_html_filter = false ) {
		if ( $is_html_filter && $is_regexp ) {
			self::$html_regexp_filters[ $match ] = $callback;
		} elseif ( $is_html_filter ) {
			self::$html_strpos_filters[ $match ] = $callback;
		} elseif ( $is_regexp ) {
			self::$regexp_filters[ $match ] = $callback;
		} else {
			self::$strpos_filters[ $match ] = $callback;
		}
	}

	/**
	 * Delete an existing registered pattern/replacement filter.
	 * The pattern is removed from all four filter tables.
	 *
	 * @param string $match Embed regexp.
	 */
	public static function unregister( $match ) {
		// Allow themes/plugins to remove registered embeds.
		unset(
			self::$regexp_filters[ $match ],
			self::$strpos_filters[ $match ],
			self::$html_regexp_filters[ $match ],
			self::$html_strpos_filters[ $match ]
		);
	}

	/**
	 * Filter and replace HTML element entity.
	 *
	 * Decodes the entity-encoded match and hands it to dispatch(), keeping the
	 * still-encoded original as the fallback return value.
	 *
	 * @param array $matches Array of matches.
	 *
	 * @return string Replacement for the matched text.
	 */
	private static function dispatch_entities( $matches ) {
		$orig_html       = $matches[0];
		$decoded_matches = array( html_entity_decode( $orig_html ) );

		return self::dispatch( $decoded_matches, $orig_html );
	}

	/**
	 * Find the callback registered for the first pattern that matches a subject.
	 * Plain substring patterns are tried first, then regular expressions.
	 *
	 * @param string $subject        Text to test: a source URL or a full HTML element.
	 * @param array  $strpos_filters Map of substring => callback.
	 * @param array  $regexp_filters Map of regular expression => callback.
	 *
	 * @return callable|null Matching callback, or null when nothing matches.
	 */
	private static function find_matching_callback( $subject, $strpos_filters, $regexp_filters ) {
		foreach ( $strpos_filters as $match => $callback ) {
			// Numeric-looking patterns become integer array keys; cast back so they are searched as text.
			if ( false !== strpos( $subject, (string) $match ) ) {
				return $callback;
			}
		}

		foreach ( $regexp_filters as $match => $callback ) {
			if ( preg_match( (string) $match, $subject ) ) {
				return $callback;
			}
		}

		return null;
	}

	/**
	 * Filter and replace HTML element.
	 *
	 * URL filters are checked first (when the element has a src or movie attribute),
	 * then the full-HTML filters. If the element has a source URL and nothing matches,
	 * it is recorded in self::$failed_embeds and returned unchanged.
	 *
	 * @param array  $matches Array of matches.
	 * @param string $orig_html Original html. Returned if no results are found via $matches processing.
	 *
	 * @return string Replacement for the matched element.
	 */
	private static function dispatch( $matches, $orig_html = null ) {
		if ( null === $orig_html ) {
			$orig_html = $matches[0];
		}

		// Undo numeric-entity encoding of the colon in "://".
		$html = preg_replace( '%&#0*58;//%', '://', $matches[0] );
		if ( ! is_string( $html ) ) {
			$html = $matches[0];
		}

		$attrs = self::get_attrs( $html );

		$src = null;
		if ( isset( $attrs['src'] ) ) {
			$src = trim( $attrs['src'] );
		} elseif ( isset( $attrs['movie'] ) ) {
			$src = trim( $attrs['movie'] );
		}

		// Check source filters.
		if ( null !== $src ) {
			$callback = self::find_matching_callback( $src, self::$strpos_filters, self::$regexp_filters );
			if ( null !== $callback ) {
				return call_user_func( $callback, $attrs );
			}
		}

		// Check html filters.
		$callback = self::find_matching_callback( $html, self::$html_strpos_filters, self::$html_regexp_filters );
		if ( null !== $callback ) {
			return call_user_func( $callback, $attrs );
		}

		// No src found and no html filter matched: nothing to log or linkify.
		if ( null === $src ) {
			return $orig_html;
		}

		// Log the strip.
		if ( function_exists( 'wp_kses_reject' ) ) {
			wp_kses_reject(
				sprintf(
					/* translators: placeholder is an HTML tag. */
					__( '<code>%s</code> HTML tag removed as it is not allowed', 'jetpack' ),
					'&lt;' . self::$current_element . '&gt;'
				),
				array( self::$current_element => $attrs )
			);
		}

		// Keep the failed match so we can later replace it with a link,
		// but return the original content to give others a chance too.
		self::$failed_embeds[] = array(
			'match' => $orig_html,
			'src'   => esc_url( $src ),
		);

		return $orig_html;
	}

	/**
	 * Failed embeds are stripped, so let's convert them to links at least.
	 *
	 * The list of failed embeds is emptied on every call that finds it non-empty,
	 * whether or not any replacement took place.
	 *
	 * @param string $string Failed embed string.
	 *
	 * @return string $string Linkified string.
	 */
	public static function maybe_create_links( $string ) {
		if ( empty( self::$failed_embeds ) ) {
			return $string;
		}

		$failed_embeds       = self::$failed_embeds;
		self::$failed_embeds = array();

		// Leave the content alone if it is not text or if it contains an iframe.
		if ( ! is_string( $string ) || false !== strpos( $string, '<iframe ' ) ) {
			return $string;
		}

		foreach ( $failed_embeds as $entry ) {
			$url    = esc_url( $entry['src'] );
			$link   = sprintf( '<a href="%1$s">%1$s</a>', $url );
			$string = str_replace( $entry['match'], $link, $string );
		}

		return $string;
	}

	/**
	 * Parse an HTML element into a flat array of attributes.
	 *
	 * The result holds the raw HTML under '_raw_html', one entry per child <param>
	 * (name => value), the element's own attributes, and the attributes of a child
	 * <embed> if there is one. Later sources overwrite earlier ones on key collisions.
	 *
	 * @param string $html Post HTML.
	 *
	 * @return array Attribute map, or an empty array when the HTML cannot be parsed.
	 */
	public static function get_attrs( $html ) {
		if (
			! ( class_exists( 'DOMDocument' ) && function_exists( 'libxml_use_internal_errors' ) && function_exists( 'simplexml_load_string' ) ) ) {
			trigger_error( // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_trigger_error
				esc_html__( 'PHP’s XML extension is not available. Please contact your hosting provider to enable PHP’s XML extension.', 'jetpack' )
			);
			return array();
		}

		// DOMDocument::loadHTML() throws on an empty string in PHP 8; there is nothing to parse anyway.
		if ( ! is_string( $html ) || '' === $html ) {
			return array();
		}

		// The @ is not enough to suppress errors when dealing with libxml,
		// we have to tell it directly how we want to handle errors.
		// Remember the caller's setting so it can be put back once we are done.
		$previous_libxml_state = libxml_use_internal_errors( true );

		// We have to go through DOM, since it can load non-well-formed XML (i.e. HTML).  SimpleXML cannot.
		$dom = new DOMDocument();
		// Suppress parser warnings.
		@$dom->loadHTML( $html ); // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged

		$xml = false;
		// phpcs:disable WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		foreach ( $dom->childNodes as $node ) {
			// find the root node (html).
			if ( XML_ELEMENT_NODE !== $node->nodeType ) {
				continue;
			}

			// html->body->object.
			$container = $node->firstChild;
			$element   = $container ? $container->firstChild : null;

			if ( $element ) {
				/*
				 * Use simplexml_load_string rather than simplexml_import_dom
				 * as the latter doesn't cope well if the XML is malformed in the DOM
				 * See #1688-wpcom.
				 */
				$xml = simplexml_load_string( $dom->saveXML( $element ) );
			}
			break;
		}
		// phpcs:enable WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase

		// Drop whatever the parsers collected and restore the previous error handling mode.
		libxml_clear_errors();
		libxml_use_internal_errors( $previous_libxml_state );

		if ( ! $xml ) {
			return array();
		}

		$attrs              = array();
		$attrs['_raw_html'] = $html;

		// <param> elements
		foreach ( $xml->param as $param ) {
			$attrs[ (string) $param['name'] ] = (string) $param['value'];
		}

		// <object> attributes
		foreach ( $xml->attributes() as $name => $attr ) {
			$attrs[ $name ] = (string) $attr;
		}

		// <embed> attributes
		if ( $xml->embed ) {
			foreach ( $xml->embed->attributes() as $name => $attr ) {
				$attrs[ $name ] = (string) $attr;
			}
		}

		return $attrs;
	}
}