summaryrefslogtreecommitdiff
blob: 08035abb85749a7d2237a646351e79d8c4cba4ed (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
<?php
declare( strict_types = 1 );

namespace MediaWiki\Extension\Translate\TranslatorInterface;

use Collation;
use MessageGroups;
use SplMinHeap;
use WANObjectCache;
use Wikimedia\LightweightObjectStore\ExpirationAwareness;

/**
 * Service for searching message groups and message keys.
 *
 * Static message groups are searched by label against a cached "haystack"
 * string that holds one group per line, ordered by collation sort key.
 *
 * @author Niklas Laxström
 * @license GPL-2.0-or-later
 */
class EntitySearch {
	/** Separates the label from the group id on each haystack line */
	private const FIELD_DELIMITER = "\x7F";
	/** Characters the haystack format reserves; stripped from both labels and queries */
	private const RESERVED_CHARACTERS = [ self::FIELD_DELIMITER => '', "\n" => '' ];
	/** @var WANObjectCache */
	private $cache;
	/** @var Collation */
	private $collation;
	/** @var MessageGroups */
	private $messageGroupFactory;

	/**
	 * @param WANObjectCache $cache Stores the generated haystack
	 * @param Collation $collation Provides the sort keys that order the haystack
	 * @param MessageGroups $messageGroupFactory Source of the groups and of the cache check key
	 */
	public function __construct( WANObjectCache $cache, Collation $collation, MessageGroups $messageGroupFactory ) {
		$this->cache = $cache;
		$this->collation = $collation;
		$this->messageGroupFactory = $messageGroupFactory;
	}

	/**
	 * Search message groups by label.
	 *
	 * Groups whose label starts with the query are listed first, followed by
	 * groups whose label contains the query at a word boundary. Within each
	 * tier the haystack order is kept. Matching is case-insensitive and the
	 * query is matched literally, not as a pattern.
	 *
	 * @param string $query Text to look for in group labels
	 * @param int $maxResults Maximum number of results; zero or less yields no results
	 * @return array[] List of [ 'label' => string, 'group' => string ]
	 */
	public function searchStaticMessageGroups( string $query, int $maxResults ): array {
		// Without this guard the limit check below, which runs only after an
		// insertion, would let one result through.
		if ( $maxResults <= 0 ) {
			return [];
		}

		$cache = $this->cache;
		// None of the static groups currently use language-dependent labels. This
		// may need revisiting later and splitting the cache by language.
		$key = $cache->makeKey( 'Translate', 'EntitySearch', 'static-groups' );
		$haystack = $cache->getWithSetCallback(
			$key,
			ExpirationAwareness::TTL_HOUR,
			function (): string {
				return $this->getStaticMessageGroupsHaystack();
			},
			[
				// Calling touchCheckKey() on this key purges the cache
				'checkKeys' => [ $this->messageGroupFactory->getCacheKey() ],
				// Avoid querying cache servers multiple times in a web request
				'pcTTL' => $cache::TTL_PROC_LONG
			]
		);

		// Algorithm: Construct one big string with one entity per line. Then run
		// preg_match_all up to twice over it, first to collect prefix matches (to
		// show them first), then to match words if more results are needed.
		$delimiter = self::FIELD_DELIMITER;
		$anything = "[^$delimiter\n]";
		// Labels have the reserved characters removed when the haystack is built,
		// so remove them from the query as well. Otherwise such a query could never
		// match the label it came from, and could match across haystack lines.
		$needle = preg_quote( strtr( $query, self::RESERVED_CHARACTERS ), '/' );
		$patterns = [
			// Prefix match
			"/^($needle$anything*)$delimiter($anything+)$/miu",
			// Word match
			"/^($anything*\b$needle$anything*)$delimiter($anything+)$/miu",
		];

		$results = [];
		foreach ( $patterns as $pattern ) {
			if ( !preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER ) ) {
				// Nothing matched (or matching failed); try the next pattern
				continue;
			}

			foreach ( $matches as [ , $label, $groupId ] ) {
				// Index by $groupId to avoid duplicates from the prefix match and the word match
				$results[$groupId] = [
					'label' => $label,
					'group' => $groupId,
				];

				if ( count( $results ) >= $maxResults ) {
					break 2;
				}
			}
		}

		return array_values( $results );
	}

	/**
	 * Build the searchable string: one "<label><FIELD_DELIMITER><group id>" line
	 * per group, ordered by the collation sort key of the label.
	 *
	 * @return string
	 */
	private function getStaticMessageGroupsHaystack(): string {
		$groups = $this->messageGroupFactory->getGroups();
		// The heap yields its entries in ascending order, i.e. sorted by sort key first
		$data = new SplMinHeap();
		foreach ( $groups as $group ) {
			// Ensure there are no special chars that will break matching
			$label = strtr( $group->getLabel(), self::RESERVED_CHARACTERS );
			$sortKey = $this->collation->getSortKey( $label );
			// It is unlikely that different groups have the same label (or sort key),
			// but it's possible. Label and id act as tie-breakers in the comparison.
			$data->insert( [ $sortKey, $label, $group->getId() ] );
		}

		$haystack = '';
		foreach ( $data as [ , $label, $groupId ] ) {
			$haystack .= $label . self::FIELD_DELIMITER . $groupId . "\n";
		}

		return $haystack;
	}
}