Initial Commit

This commit is contained in:
Netkas 2023-07-23 16:01:28 -04:00
commit 402c3b5cb6
No known key found for this signature in database
GPG key ID: 5DAF58535614062B
27 changed files with 911 additions and 0 deletions

View file

@ -0,0 +1,13 @@
<?php
namespace Keybert\Exceptions;
use Throwable;
/**
 * Exception type used for failures that are specific to Keybert.
 *
 * It adds no behaviour of its own; it only provides a dedicated class
 * that callers can catch separately from generic exceptions.
 */
class KeybertException extends \Exception
{
    /**
     * Creates the exception and forwards every argument to the base class.
     *
     * @param string $message Human-readable description of the failure.
     * @param int $code Optional numeric error code.
     * @param Throwable|null $previous Optional underlying cause, kept for exception chaining.
     */
    public function __construct(
        string $message = "",
        int $code = 0,
        ?Throwable $previous = null
    )
    {
        parent::__construct($message, $code, $previous);
    }
}

346
src/Keybert/Keybert.php Normal file
View file

@ -0,0 +1,346 @@
<?php
/** @noinspection PhpMissingFieldTypeInspection */
namespace Keybert;
use Exception;
use JsonException;
use Keybert\Exceptions\KeybertException;
use RuntimeException;
use Symfony\Component\Process\ExecutableFinder;
use Symfony\Component\Process\Process;
/**
 * Client for the Keybert HTTP wrapper (wrapper.py).
 *
 * The class works in one of two modes, selected by the constructor:
 *  - internal: no host is given, so a local `python3 wrapper.py` process is
 *    spawned and all requests are sent to it on 127.0.0.1;
 *  - remote: a host and port are given and no process is spawned.
 */
class Keybert
{
    /**
     * Handle of the locally spawned wrapper process.
     * Stays null in remote mode and is reset to null by stop().
     *
     * @var Process|null
     */
    private ?Process $process = null;

    /**
     * Host the HTTP requests are sent to ('127.0.0.1' in internal mode).
     *
     * @var string
     */
    private string $host;

    /**
     * TCP port the HTTP requests are sent to.
     *
     * @var int
     */
    private int $port;

    /**
     * True when this instance owns a locally spawned wrapper process.
     *
     * @var bool
     */
    private bool $internal;

    /**
     * Model names handed to the wrapper through KEYBERT_PRELOAD.
     *
     * @var array
     */
    private array $preload;

    /**
     * Creates the client and, in internal mode, starts the wrapper process.
     *
     * @param array|null $preload Model names passed to the wrapper via KEYBERT_PRELOAD (internal mode only).
     * @param string|null $host Host of a remote wrapper; null selects internal mode.
     * @param int|null $port Port to use. Required in remote mode; when null in internal mode a free port is picked.
     * @throws RuntimeException If a host is given without a port, or the internal process cannot be started.
     */
    public function __construct(?array $preload=null, ?string $host=null, ?int $port=null)
    {
        $this->internal = ($host === null);
        $this->preload = $preload ?? [];

        if(!$this->internal && $port === null)
        {
            throw new RuntimeException('If Keybert is running remotely, a port must be specified. (Host is not empty, but the port is.)');
        }

        $this->host = $host ?? '127.0.0.1';
        $this->port = $port ?? self::getAvailablePort();

        // Start the internal Keybert process if we're running locally.
        if($this->internal)
        {
            $this->start();
        }
    }

    /**
     * Sends a JSON POST request to the wrapper and returns the `data` member of its response.
     *
     * @param string $method Route name appended to the base URL (e.g. "ping").
     * @param array|null $parameters Request body; encoded as JSON (an empty array when null).
     * @return mixed The `data` member of the decoded response, or null when it is absent.
     * @throws KeybertException If the response does not carry a truthy `status`.
     * @throws RuntimeException If the request could not be performed or the response could not be decoded.
     * @noinspection HttpUrlsUsage
     */
    private function invoke(string $method, ?array $parameters = null): mixed
    {
        $ch = null;
        $result = null;
        $json_result = null;

        try
        {
            $ch = curl_init(sprintf('http://%s:%d/%s', $this->host, $this->port, $method));
            if($ch === false)
            {
                throw new RuntimeException('Failed to initialize the cURL handle.');
            }

            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($parameters ?? [], JSON_THROW_ON_ERROR));
            curl_setopt($ch, CURLOPT_HTTPHEADER, [
                'Content-Type: application/json',
            ]);

            $result = curl_exec($ch);
            if($result === false)
            {
                throw new RuntimeException(sprintf('cURL request failed: %s', curl_error($ch)));
            }

            // Check the status before decoding, so a non-200 response with a
            // non-JSON body is reported as an HTTP failure and not as a JSON error.
            $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            if($http_code !== 200)
            {
                throw new RuntimeException('HTTP request failed with status code ' . $http_code);
            }

            $json_result = json_decode($result, true, 512, JSON_THROW_ON_ERROR);
        }
        catch(Exception $e)
        {
            // JsonException extends Exception, so a single catch covers both.
            throw new RuntimeException('Failed to invoke Keybert method "' . $method . '".', 0, $e);
        }
        finally
        {
            if($ch)
            {
                curl_close($ch);
            }

            $this->printUpdates();
        }

        if(!is_array($json_result) || empty($json_result['status']))
        {
            if(is_array($json_result) && isset($json_result['message']))
            {
                throw new KeybertException($json_result['message']);
            }

            throw new KeybertException(sprintf('Failed to invoke Keybert method "%s", %s.', $method, $result));
        }

        return $json_result['data'] ?? null;
    }

    /**
     * Starts the internal Keybert process and waits (up to two minutes) until it answers a ping.
     *
     * @return void
     * @throws RuntimeException If python3 is missing, the process exits, or it does not become ready in time.
     */
    private function start(): void
    {
        if(!$this->internal)
        {
            return;
        }

        // ExecutableFinder::find() returns null when the binary is not found.
        $python = (new ExecutableFinder())->find('python3');
        if($python === null)
        {
            throw new RuntimeException('Failed to start Keybert process, the "python3" executable could not be found.');
        }

        try
        {
            $this->process = new Process([
                $python,
                __DIR__ . DIRECTORY_SEPARATOR . 'wrapper.py',
            ]);

            $this->process->setEnv([
                'KEYBERT_ENABLED' => '1',
                // Requests in internal mode only ever target $this->host (127.0.0.1).
                // A wrapper that does not read KEYBERT_HOST simply ignores it.
                'KEYBERT_HOST' => $this->host,
                'KEYBERT_PORT' => (string)$this->port,
                'KEYBERT_PRELOAD' => json_encode($this->preload, JSON_THROW_ON_ERROR),
            ]);
        }
        catch(Exception $e)
        {
            throw new RuntimeException('Failed to start Keybert process.', 0, $e);
        }

        // Start the process and wait for it to be ready. (2-Minutes timeout)
        $this->process->start();
        $start_time = time();
        $last_error = null;

        while($this->process->getExitCode() === null)
        {
            $this->printUpdates();

            try
            {
                if($this->invoke('ping'))
                {
                    break;
                }
            }
            catch(Exception $e)
            {
                // Expected while the wrapper is still booting; remembered for the timeout report.
                $last_error = $e;
            }

            // The timeout applies to every failed attempt, not only to those that threw.
            if(time() - $start_time > 120)
            {
                // Do not leave a half-started child process behind.
                $this->stop();
                throw new RuntimeException('Keybert wrapper process failed to start.', 0, $last_error);
            }

            sleep(1);
        }

        if($this->process->getExitCode() !== null)
        {
            // Flush whatever the wrapper printed before it died.
            $this->printUpdates();
            throw new RuntimeException(sprintf('Keybert wrapper process exited with code %d.', $this->process->getExitCode()));
        }
    }

    /**
     * Stops the internal Keybert process if it is running.
     *
     * @return void
     */
    private function stop(): void
    {
        if(!$this->process || !$this->process->isRunning())
        {
            return;
        }

        $this->process->stop();
        $this->process = null;
    }

    /**
     * Prints the output the internal Keybert process produced since the last call
     * (standard output first, then standard error).
     *
     * @return void
     */
    private function printUpdates(): void
    {
        if(!$this->internal || !$this->process)
        {
            return;
        }

        print($this->process->getIncrementalOutput());
        print($this->process->getIncrementalErrorOutput());
    }

    /**
     * Returns a port from the given range on which nothing accepts a TCP connection.
     *
     * Ports are probed in random order; the first one whose connection attempt
     * fails is returned.
     *
     * @param string $host Host to probe.
     * @param int $start First port of the range (inclusive).
     * @param int $end Last port of the range (inclusive).
     * @return int
     * @throws RuntimeException If every port in the range accepted a connection.
     */
    private static function getAvailablePort(string $host='127.0.0.1', int $start=1024, int $end=65535): int
    {
        $candidates = range($start, $end);
        shuffle($candidates);

        foreach($candidates as $port)
        {
            // A failed connect is the expected outcome for a free port, hence the
            // suppressed warning. The explicit timeout bounds every probe.
            $connection = @stream_socket_client(sprintf('tcp://%s:%d', $host, $port), $error_code, $error_message, 1.0);

            if(!is_resource($connection))
            {
                return $port;
            }

            fclose($connection);
        }

        throw new RuntimeException('No available port found in range ' . $start . ' to ' . $end . '.');
    }

    /**
     * Keeps printing the output of the internal Keybert process until it stops running.
     * Returns immediately in remote mode.
     *
     * @return void
     */
    public function run(): void
    {
        if(!$this->internal)
        {
            return;
        }

        // The nullsafe call ends the loop if the process handle has been released.
        while($this->process?->isRunning())
        {
            $this->printUpdates();
            sleep(1);
        }
    }

    /**
     * Loads a model into Keybert.
     *
     * @param string $model The model to load, e.g. "paraphrase-multilingual-MiniLM-L12-v2"
     * @return void
     * @throws KeybertException
     */
    public function loadModel(string $model): void
    {
        $this->invoke('load_model', [
            'model_name' => $model,
        ]);
    }

    /**
     * Extracts keywords from the given document(s).
     *
     * Every argument is forwarded unchanged to the wrapper's "extract_keywords" route.
     *
     * @param string $model The model to use, e.g. "paraphrase-multilingual-MiniLM-L12-v2"
     * @param string|array $docs The document or documents to extract keywords from.
     * @param array|null $candidates Forwarded as KeyBERT's `candidates` option (see the KeyBERT documentation).
     * @param array $keyphrase_ngram_range The ngram range to use for the keyphrases. (eg; [1, 2])
     * @param string $stop_words The stop words to use. (eg; "english")
     * @param int $top_n The number of keywords to return.
     * @param int $min_df The minimum document frequency.
     * @param bool $use_maxsum Whether to use the MaxSum algorithm.
     * @param bool $use_mmr Whether to use the Maximal Marginal Relevance algorithm.
     * @param float $diversity The diversity of the MMR algorithm.
     * @param int $nr_candidates Forwarded as KeyBERT's `nr_candidates` option (see the KeyBERT documentation).
     * @param array|null $seed_keywords Forwarded as KeyBERT's `seed_keywords` option (see the KeyBERT documentation).
     * @return array The `data` payload of the wrapper's response (keyword => score for a single document).
     * @throws KeybertException If the extraction failed.
     */
    public function extractKeywords(
        string $model,
        string|array $docs,
        ?array $candidates = null,
        array $keyphrase_ngram_range = [1, 1],
        string $stop_words = 'english',
        int $top_n = 5,
        int $min_df = 1,
        bool $use_maxsum = false,
        bool $use_mmr = false,
        float $diversity = 0.5,
        int $nr_candidates = 20,
        ?array $seed_keywords = null
    ): array {
        return $this->invoke('extract_keywords', [
            'model_name' => $model,
            'docs' => $docs,
            'candidates' => $candidates,
            'keyphrase_ngram_range' => $keyphrase_ngram_range,
            'stop_words' => $stop_words,
            'top_n' => $top_n,
            'min_df' => $min_df,
            'use_maxsum' => $use_maxsum,
            'use_mmr' => $use_mmr,
            'diversity' => $diversity,
            'nr_candidates' => $nr_candidates,
            'seed_keywords' => $seed_keywords,
        ]);
    }

    /**
     * Destructor; stops the internal process if one is still running.
     */
    public function __destruct()
    {
        $this->stop();
    }
}

59
src/Keybert/Program.php Normal file
View file

@ -0,0 +1,59 @@
<?php
namespace Keybert;
use ncc\Runtime;
/**
 * Command-line entry point that runs a local Keybert instance.
 */
class Program
{
    /**
     * Main entry point for the CLI
     *
     * The port and the models to preload are taken from the KEYBERT_PORT and
     * KEYBERT_PRELOAD environment variables when those are set, otherwise from
     * the `port`/`p` and `preload`/`l` entries of $args.
     *
     * @param array $args
     * @return void
     * @throws \InvalidArgumentException If the port or the preload list is malformed.
     */
    public static function main(array $args=[]): void
    {
        $env_port = getenv('KEYBERT_PORT');
        $port = ($env_port === false)
            ? ($args['port'] ?? $args['p'] ?? null)
            : $env_port;

        $env_preload = getenv('KEYBERT_PRELOAD');
        $preload = ($env_preload === false)
            ? ($args['preload'] ?? $args['l'] ?? null)
            : $env_preload;

        // Keybert::__construct() expects ?array and ?int. Both values arrive here
        // as strings, so they are normalised before being passed on.
        $keybert = new Keybert(self::parsePreload($preload), null, self::parsePort($port));
        $keybert->run();
    }

    /**
     * Turns a raw preload value into a list of model names.
     *
     * Accepts an array, a JSON array string (e.g. '["a","b"]') or a
     * comma-separated string (e.g. "a,b"), as advertised by help().
     *
     * @param mixed $value
     * @return array|null List of non-empty model names, or null when nothing usable was given.
     * @throws \InvalidArgumentException If the value looks like JSON but cannot be decoded.
     */
    private static function parsePreload(mixed $value): ?array
    {
        if(is_array($value))
        {
            $models = $value;
        }
        elseif(is_string($value) && trim($value) !== '')
        {
            $value = trim($value);

            if($value[0] === '[')
            {
                try
                {
                    $models = json_decode($value, true, 512, JSON_THROW_ON_ERROR);
                }
                catch(\JsonException $e)
                {
                    throw new \InvalidArgumentException('The preload list is not valid JSON.', 0, $e);
                }
            }
            else
            {
                $models = explode(',', $value);
            }
        }
        else
        {
            return null;
        }

        $models = array_filter($models, static fn($model): bool => is_scalar($model));
        $models = array_map(static fn($model): string => trim((string)$model), $models);
        $models = array_values(array_filter($models, static fn(string $model): bool => $model !== ''));

        return $models === [] ? null : $models;
    }

    /**
     * Validates a raw port value.
     *
     * @param mixed $value
     * @return int|null The port, or null when no port was given.
     * @throws \InvalidArgumentException If the value is not an integer between 1 and 65535.
     */
    private static function parsePort(mixed $value): ?int
    {
        if($value === null || $value === '')
        {
            return null;
        }

        $port = filter_var($value, FILTER_VALIDATE_INT, [
            'options' => ['min_range' => 1, 'max_range' => 65535],
        ]);

        if($port === false)
        {
            throw new \InvalidArgumentException(sprintf(
                'Invalid port "%s", expected an integer between 1 and 65535.',
                is_scalar($value) ? (string)$value : gettype($value)
            ));
        }

        return $port;
    }

    /**
     * Displays the help message and terminates the script with exit code 0.
     *
     * @return void
     */
    public static function help(): void
    {
        // NOTE(review): the package name below refers to "federationlib", which looks
        // copy-pasted from another project - confirm the intended package name.
        print('Keybert v' . Runtime::getConstant('net.nosial.federationlib', 'version') . PHP_EOL . PHP_EOL);
        print('Usage: Keybert [options]' . PHP_EOL);
        print('Options:' . PHP_EOL);
        print(' --port|-p - The port to listen on' . PHP_EOL);
        print(' --preload|-l - Optional. Models to preload, separated by commas' . PHP_EOL);
        print(PHP_EOL);
        print('Environment Variables:' . PHP_EOL);
        print(' KEYBERT_PORT - The port to listen on' . PHP_EOL);
        print(' KEYBERT_PRELOAD - Optional. Models to preload, separated by commas' . PHP_EOL);
        exit(0);
    }
}

97
src/Keybert/wrapper.py Normal file
View file

@ -0,0 +1,97 @@
from aiohttp import web
from keybert import KeyBERT
import json
import os
app = web.Application()
models = {}
# Ping
async def ping(request):
    """Health-check route: always answers with a successful JSON envelope.

    The request is not inspected.
    """
    payload = {'status': True, 'message': None, 'data': True}
    return web.json_response(payload)
# Load a model
async def load_model(request):
    """Load the model named in the request body into the `models` cache.

    Reads `model_name` from the JSON body. Replies with a success envelope once
    the model is stored, or with a failure envelope when the name is missing or
    any exception is raised along the way.
    """
    try:
        payload = await request.json()
        name = payload.get('model_name', '')

        # Guard clause: nothing to load without a name.
        if not name:
            return web.json_response({'status': False, 'message': 'No model name provided.', 'data': None})

        models[name] = KeyBERT(model=name)
        return web.json_response({'status': True, 'message': None, 'data': f'Model {name} loaded.'})
    except Exception as error:
        return web.json_response({'status': False, 'message': str(error), 'data': None})
# Extract keywords
async def extract_keywords(request):
    """Extract keywords from the document(s) in the request body.

    Reads `model_name`, `docs` and the extraction options from the JSON body,
    loads the model on first use, and replies with a JSON envelope:

    * one document -> `data` is a `{keyword: score}` mapping;
    * several documents -> `data` is a list with one such mapping per document.

    Any failure (missing input or an exception) is reported as a failure
    envelope with `status` False and a message.
    """
    try:
        data = await request.json()
        model_name = data.get('model_name', '')
        docs = data.get('docs', [])

        # Validate before loading anything, so an invalid request never triggers
        # a model load.
        if not docs:
            return web.json_response(
                {'status': False, 'message': 'No document provided.', 'data': None})
        if not model_name:
            return web.json_response(
                {'status': False, 'message': 'No model name provided.', 'data': None})

        # Check if the model is loaded, if not, load it
        if model_name not in models:
            models[model_name] = KeyBERT(model=model_name)

        keywords = models[model_name].extract_keywords(
            docs,
            candidates=data.get('candidates', None),
            # `or` also covers an explicit null in the body, which tuple() cannot take.
            keyphrase_ngram_range=tuple(data.get('keyphrase_ngram_range') or (1, 1)),
            stop_words=data.get('stop_words', 'english'),
            top_n=data.get('top_n', 5),
            min_df=data.get('min_df', 1),
            use_maxsum=data.get('use_maxsum', False),
            use_mmr=data.get('use_mmr', False),
            diversity=data.get('diversity', 0.5),
            nr_candidates=data.get('nr_candidates', 20),
            seed_keywords=data.get('seed_keywords', None),
        )

        # A single document yields a flat list of (word, score) tuples. Several
        # documents yield one such list per document, and unpacking those as
        # (word, score) pairs would fail.
        if keywords and isinstance(keywords[0], list):
            result = [dict(doc_keywords) for doc_keywords in keywords]
        else:
            result = dict(keywords)

        return web.json_response({'status': True, 'message': None, 'data': result})
    except Exception as e:
        return web.json_response({'status': False, 'message': str(e), 'data': None})
if __name__ == '__main__':
    # Script entry point: validate the environment, preload models, register
    # the routes and start the HTTP server.

    # Check for required environment variables
    if os.environ.get('KEYBERT_ENABLED') != '1':
        # The check is against '1', so the message says so.
        print('Error: KEYBERT_ENABLED environment variable not set or not "1".')
        raise SystemExit(1)

    port_value = os.environ.get('KEYBERT_PORT')
    if port_value is None:
        print('Error: KEYBERT_PORT environment variable not set.')
        raise SystemExit(1)

    try:
        port = int(port_value)
    except ValueError:
        print(f'Error: KEYBERT_PORT must be an integer, got "{port_value}".')
        raise SystemExit(1)

    # Preload models if KEYBERT_PRELOAD is set (an empty value means "nothing to preload").
    preload_value = os.environ.get('KEYBERT_PRELOAD', '').strip()
    if preload_value:
        try:
            preload_models = json.loads(preload_value)
        except json.JSONDecodeError:
            # Not JSON: treat the value as a comma-separated list of model names.
            preload_models = [name.strip() for name in preload_value.split(',') if name.strip()]

        # A bare JSON string names a single model; iterating it would yield characters.
        if isinstance(preload_models, str):
            preload_models = [preload_models]
        elif not isinstance(preload_models, list):
            print('Error: KEYBERT_PRELOAD must be a JSON array or a comma-separated list of model names.')
            raise SystemExit(1)

        for model_name in preload_models:
            models[model_name] = KeyBERT(model=model_name)
            print(f'Model {model_name} preloaded.')

    # Add routes
    app.router.add_post('/ping', ping)
    app.router.add_post('/load_model', load_model)
    app.router.add_post('/extract_keywords', extract_keywords)

    # Start server. The bind address can be narrowed through KEYBERT_HOST
    # (e.g. 127.0.0.1); it defaults to all interfaces, as before.
    web.run_app(app, host=os.environ.get('KEYBERT_HOST', '0.0.0.0'), port=port)