@sbrl/

Peppermint STAS

PHP CLI

The experimental Pepperminty Wiki Search Term Analysis System (STAS)

fork
loading
Files
  • main.php
main.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
<?php

/**
 * Splits a *transliterated* query string into tokens.
 * Tokens are separated by runs of whitespace, except that whitespace between
 * a pair of double quotes is kept as part of the current token. The quote
 * characters themselves are left in the token, and an unterminated quote
 * extends to the end of the string.
 * The string is walked one byte at a time, which is why it needs to have been
 * transliterated beforehand.
 * Actually based on my earlier explode_adv https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
 * @param	string	$query	The query string to split.
 * @return	string[]	The tokens, in the order they appear in the query.
 */
function stas_split($query) {
	$terms = [];
	$next_term = "";
	$in_quotes = false; // true = now inside a quoted section, false = now outside
	foreach(str_split($query) as $char)
	{
		// A double quote flips the quoted state (and is still kept in the token below)
		if($char === '"')
			$in_quotes = !$in_quotes;
		
		// Anything that isn't whitespace, and any whitespace inside quotes, belongs to the current token
		if($in_quotes || !ctype_space($char)) {
			$next_term .= $char;
			continue;
		}
		
		// Unquoted whitespace ends the current token, if there is one.
		// Compare against "" rather than using empty(): empty("0") is true,
		// which would glue a "0" token onto whatever follows it.
		if($next_term !== "") {
			$terms[] = $next_term;
			$next_term = "";
		}
	}
	
	// Flush the final token - the query usually doesn't end in whitespace
	if($next_term !== "")
		$terms[] = $next_term;

	return $terms;
}

/**
 * Parses an array of query tokens into an associative array of search directives.
 * Supported syntax derived from these sources:
 *  - https://help.duckduckgo.com/duckduckgo-help-pages/results/syntax/
 *  - https://docs.microsoft.com/en-us/windows/win32/lwef/-search-2x-wds-aqsreference
 *
 * Tokens are processed from last to first, so the entries in each list of the
 * result appear in the reverse of their order in the query, and if several
 * !bang tokens are present the one nearest the start of the query wins.
 * Empty tokens are ignored.
 *
 * @param	string[]	$tokens	The array of query tokens to parse.
 * @return	array	An associative array with these keys:
 * 					 - "terms": a list of [ "term" => string, "weight" => int ] records
 * 					 - "exclude": a list of terms to exclude
 * 					 - "interwiki": the interwiki shortcut to redirect to, or null
 * 					 - one extra key per colon directive seen, each holding a list of that directive's values
 */
function stas_parse($tokens) {
	/* Supported Syntax *
	 * 
	 * -term				exclude a term
	 * +term				double the weighting of a term
	 * terms !dest terms	redirect entire query (minus the !bang) to interwiki with registered shortcut dest
	 * prefix:term			apply prefix operator to term
	 */
	$result = [
		"terms" => [],
		"exclude" => [],
		"interwiki" => null
	];
	// These keys have a fixed shape, so a directive of the same name (e.g.
	// terms:foo) must not be allowed to write into them.
	$reserved = [ "terms", "exclude", "interwiki" ];

	for($i = count($tokens) - 1; $i >= 0; $i--) {
		$token = $tokens[$i];
		
		// An empty token carries no information, and indexing its first character would raise a warning
		if($token === "") continue;
		
		// Look for excludes
		if($token[0] === "-") {
			$result["exclude"][] = substr($token, 1);
			continue;
		}

		// Look for weighted terms
		if($token[0] === "+") {
			$result["terms"][] = [
				"term" => substr($token, 1),
				"weight" => 2
			];
			continue;
		}

		// Look for interwiki searches
		if($token[0] === "!" || substr($token, -1) === "!") {
			// You can only go to 1 interwiki destination at once, so we replace any previous finding with this one
			$result["interwiki"] = trim($token, "!");
			// The !bang is removed from the query, so it must not also be recorded as a search term
			continue;
		}

		// Look for colon directives in the form directive:term
		// Also supports prefix:"quoted term with spaces", quotes stripped automatically
		/*** Example directives ***
		 * intitle		search only page titles for term
		 * intags		search only tags for term
		 * inpage		search page only for term
		 * before		search only pages that were last modified before term
		 * after		search only pages that were last modified after term
		 * size			search only pages that match the size spec term (e.g. 1k+ -> more than 1k bytes, 2k- -> less than 2k bytes, >5k -> more than 5k bytes, <10k -> less than 10k bytes)
		 **************************/
		if(strpos($token, ":") !== false) {
			$parts = explode(":", $token, 2);
			$directive = $parts[0];
			// A directive named after a built-in key falls through and is treated as a plain term instead
			if(!in_array($directive, $reserved, true)) {
				if(!isset($result[$directive]))
					$result[$directive] = [];
				$result[$directive][] = trim($parts[1], '"');
				continue;
			}
		}

		// Doesn't appear to be particularly special *shrugs*
		$result["terms"][] = [
			"term" => $token,
			"weight" => 1
		];
	}

	return $result;
}

/**
 * Naively splits a string on single space characters.
 * No quote handling is done, and consecutive spaces produce empty-string
 * entries in the result.
 * @param	string	$str	The string to split.
 * @return	string[]	The pieces between the spaces.
 */
function split_simple($str) {
	$separator = " ";
	$pieces = explode($separator, $str);
	return $pieces;
}

// Banner
echo "*** Experimental Pepperminty Wiki STA System ***\n";


// Run a fixed sample query through the splitter and the parser, dump the result, then stop.
$demo_tokens = stas_split("    testing \t \t -dogs +cats intitle:rabbit date:\"3 days ago\" size:>1k");
var_dump(stas_parse($demo_tokens));
exit();

// NOTE: everything below is unreachable while the exit() above is in place.
// Interactive mode: read one line of search terms from the terminal and dump how it parses.
echo "Enter space-separated search terms:\n";
echo "> ";
$typed_query = trim(readline());
$typed_tokens = stas_split($typed_query);
var_dump(stas_parse($typed_tokens));