1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
|
<?php
declare( strict_types = 1 );
namespace MediaWiki\Extension\Translate\TranslatorInterface;
use Collation;
use MessageGroups;
use SplMinHeap;
use WANObjectCache;
use Wikimedia\LightweightObjectStore\ExpirationAwareness;
/**
* Service for searching message groups and message keys.
* @author Niklas Laxström
* @license GPL-2.0-or-later
*/
class EntitySearch {
	/** Separates the label from the group id on each line of the haystack. */
	private const FIELD_DELIMITER = "\x7F";
	/** @var WANObjectCache */
	private $cache;
	/** @var Collation */
	private $collation;
	/** @var MessageGroups */
	private $messageGroupFactory;

	/**
	 * @param WANObjectCache $cache Stores the pre-built search haystack
	 * @param Collation $collation Provides the sort keys used to order the labels
	 * @param MessageGroups $messageGroupFactory Source of the groups and of the cache check key
	 */
	public function __construct( WANObjectCache $cache, Collation $collation, MessageGroups $messageGroupFactory ) {
		$this->cache = $cache;
		$this->collation = $collation;
		$this->messageGroupFactory = $messageGroupFactory;
	}

	/**
	 * Search message groups by label, case-insensitively.
	 *
	 * Groups whose label starts with the query are listed first, followed by groups
	 * where the query starts at a word boundary anywhere in the label. Each group is
	 * listed at most once.
	 *
	 * @param string $query Text to search for; it is matched literally, not as a pattern
	 * @param int $maxResults Upper limit for the number of results; values below one
	 *  yield an empty list
	 * @return array[] List of results, each of the form
	 *  [ 'label' => string, 'group' => string ]
	 */
	public function searchStaticMessageGroups( string $query, int $maxResults ): array {
		// The limit is only checked after adding a result, so without this guard a
		// non-positive limit would still produce one result.
		if ( $maxResults <= 0 ) {
			return [];
		}

		$cache = $this->cache;
		// None of the static groups currently use language-dependent labels. This
		// may need revisiting later and splitting the cache by language.
		$key = $cache->makeKey( 'Translate', 'EntitySearch', 'static-groups' );
		$haystack = $cache->getWithSetCallback(
			$key,
			ExpirationAwareness::TTL_HOUR,
			function (): string {
				return $this->getStaticMessageGroupsHaystack();
			},
			[
				// Calling touchCheckKey() on this key purges the cache
				'checkKeys' => [ $this->messageGroupFactory->getCacheKey() ],
				// Avoid querying cache servers multiple times in a web request
				'pcTTL' => $cache::TTL_PROC_LONG
			]
		);

		// Algorithm: Construct one big string with one entity per line. Then run
		// preg_match_all twice over it, first to collect prefix match (to show them
		// first), then to match words if more results are needed.
		$delimiter = self::FIELD_DELIMITER;
		$anything = "[^$delimiter\n]";
		$query = preg_quote( $query, '/' );
		$patterns = [
			// Prefix match
			"/^($query$anything*)$delimiter($anything+)$/miu",
			// Word match
			"/^($anything*\b$query$anything*)$delimiter($anything+)$/miu",
		];

		$results = [];
		foreach ( $patterns as $pattern ) {
			$results = $this->collectMatches( $pattern, $haystack, $results, $maxResults );
			if ( count( $results ) >= $maxResults ) {
				break;
			}
		}

		return array_values( $results );
	}

	/**
	 * Run one pattern over the haystack and add its matches to the results.
	 *
	 * @param string $pattern Regular expression whose first capture group is the label
	 *  and whose second capture group is the group id
	 * @param string $haystack One "label, delimiter, group id" entry per line
	 * @param array[] $results Results collected so far, indexed by group id
	 * @param int $maxResults Stop adding once this many results have been collected
	 * @return array[] The results including the new matches, indexed by group id
	 */
	private function collectMatches( string $pattern, string $haystack, array $results, int $maxResults ): array {
		$matches = [];
		// preg_match_all() returns false on failure (for example when the query is not
		// valid UTF-8, which the /u modifier rejects) and 0 when nothing matched. In
		// both cases there is nothing to add.
		if ( !preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER ) ) {
			return $results;
		}

		foreach ( $matches as [ , $label, $groupId ] ) {
			// Index by $groupId to avoid duplicates from the prefix match and the word match
			$results[$groupId] = [
				'label' => $label,
				'group' => $groupId,
			];
			if ( count( $results ) >= $maxResults ) {
				break;
			}
		}

		return $results;
	}

	/**
	 * Build the string that searchStaticMessageGroups() matches against.
	 *
	 * @return string One line per group in the form "label, FIELD_DELIMITER, group id",
	 *  each terminated by a newline, ordered by the collation sort key of the label
	 */
	private function getStaticMessageGroupsHaystack(): string {
		$groups = $this->messageGroupFactory->getGroups();
		// The min-heap hands the entries back in ascending order when iterated. Entries
		// are arrays, so they are compared element by element starting from the sort key.
		$data = new SplMinHeap();
		foreach ( $groups as $group ) {
			$label = $group->getLabel();
			// Ensure there are no special chars that will break matching
			$label = strtr( $label, [ self::FIELD_DELIMITER => '', "\n" => '' ] );
			$sortKey = $this->collation->getSortKey( $label );
			// It is unlikely that different groups have the same label (or sort key),
			// but it's possible. In that case the remaining elements decide the order.
			$data->insert( [ $sortKey, $label, $group->getId() ] );
		}

		$haystack = '';
		foreach ( $data as [ , $label, $groupId ] ) {
			$haystack .= $label . self::FIELD_DELIMITER . $groupId . "\n";
		}

		return $haystack;
	}
}
|