Initial Commit
This commit is contained in:
commit
402c3b5cb6
27 changed files with 911 additions and 0 deletions
13
src/Keybert/Exceptions/KeybertException.php
Normal file
13
src/Keybert/Exceptions/KeybertException.php
Normal file
|
@ -0,0 +1,13 @@
|
|||
<?php
|
||||
|
||||
namespace Keybert\Exceptions;
|
||||
|
||||
use Throwable;
|
||||
|
||||
/**
 * Exception type raised for failures reported by the Keybert service.
 */
class KeybertException extends \Exception
{
    /**
     * Builds the exception by forwarding every argument to the base Exception.
     *
     * @param string $message Human-readable description of the failure.
     * @param int $code Optional numeric error code.
     * @param Throwable|null $previous Optional underlying cause, kept for exception chaining.
     */
    public function __construct(
        string $message = "",
        int $code = 0,
        ?Throwable $previous = null
    ) {
        parent::__construct($message, $code, $previous);
    }
}
|
346
src/Keybert/Keybert.php
Normal file
346
src/Keybert/Keybert.php
Normal file
|
@ -0,0 +1,346 @@
|
|||
<?php
|
||||
|
||||
/** @noinspection PhpMissingFieldTypeInspection */
|
||||
|
||||
namespace Keybert;
|
||||
|
||||
use Exception;
|
||||
use JsonException;
|
||||
use Keybert\Exceptions\KeybertException;
|
||||
use RuntimeException;
|
||||
use Symfony\Component\Process\ExecutableFinder;
|
||||
use Symfony\Component\Process\Process;
|
||||
|
||||
/**
 * Client for the Keybert HTTP wrapper (wrapper.py).
 *
 * When no host is given the class spawns the Python wrapper as a child process
 * on 127.0.0.1 and talks to it; otherwise it talks to an already running
 * server at the given host and port.
 */
class Keybert
{
    /**
     * Maximum number of seconds to wait for the internal wrapper to answer its first ping.
     */
    private const START_TIMEOUT = 120;

    /**
     * Handle of the locally spawned wrapper process; null when a remote server
     * is used or once the process has been stopped.
     *
     * @var Process|null
     */
    private ?Process $process = null;

    /**
     * Host the HTTP requests are sent to.
     *
     * @var string|null
     */
    private ?string $host = null;

    /**
     * Port the HTTP requests are sent to.
     *
     * @var int
     */
    private int $port;

    /**
     * True when this instance owns a locally spawned wrapper process.
     *
     * @var bool
     */
    private bool $internal;

    /**
     * Model names handed to the wrapper through KEYBERT_PRELOAD.
     *
     * @var array
     */
    private array $preload;

    /**
     * Creates the client and, when no host is given, starts the internal wrapper process.
     *
     * @param array|null $preload Model names the internal wrapper should load at start-up.
     * @param string|null $host Remote host; null runs the wrapper locally on 127.0.0.1.
     * @param int|null $port Port to use. Required for a remote host; for a local
     *                       wrapper a free port is picked when omitted.
     * @throws RuntimeException If a host is given without a port, or the local wrapper cannot be started.
     */
    public function __construct(?array $preload=null, ?string $host=null, ?int $port=null)
    {
        $this->internal = ($host === null);
        $this->preload = $preload ?? [];

        if(!$this->internal && $port === null)
        {
            throw new RuntimeException('If Keybert is running remotely, a port must be specified. (Host is not empty, but the port is.)');
        }

        $this->host = $this->internal ? '127.0.0.1' : $host;
        $this->port = $port ?? self::getAvailablePort();

        // Start the internal Keybert process if we're running locally.
        if($this->internal)
        {
            $this->start();
        }
    }

    /**
     * Sends a JSON POST request to the wrapper and returns the "data" member of its reply.
     *
     * @param string $method Endpoint name, appended to the base URL.
     * @param array|null $parameters Request payload; an empty array is sent when null.
     * @return mixed The "data" member of the decoded reply (null when absent).
     * @throws KeybertException If the reply does not carry a truthy "status".
     * @throws RuntimeException If the request itself fails (transport error, non-200 status, invalid JSON).
     * @noinspection HttpUrlsUsage
     */
    private function invoke(string $method, ?array $parameters = null): mixed
    {
        $ch = null;
        $result = null;
        $json_result = null;

        try
        {
            $ch = curl_init(sprintf('http://%s:%d/%s', $this->host, $this->port, $method));

            if($ch === false)
            {
                throw new RuntimeException('Failed to initialize the cURL handle.');
            }

            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($parameters ?? [], JSON_THROW_ON_ERROR));
            curl_setopt($ch, CURLOPT_HTTPHEADER, [
                'Content-Type: application/json',
            ]);

            $result = curl_exec($ch);

            if($result === false)
            {
                throw new RuntimeException(sprintf('cURL request failed: %s', curl_error($ch)));
            }

            // Check the status before decoding so that a non-JSON error page is
            // reported with its status code instead of as a JSON syntax error.
            $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);

            if($http_code !== 200)
            {
                throw new RuntimeException('HTTP request failed with status code ' . $http_code);
            }

            $json_result = json_decode($result, true, 512, JSON_THROW_ON_ERROR);
        }
        catch(Exception $e)
        {
            throw new RuntimeException('Failed to invoke Keybert method "' . $method . '".', 0, $e);
        }
        finally
        {
            if($ch)
            {
                curl_close($ch);
            }

            // Relay whatever the local wrapper printed while handling the request.
            $this->printUpdates();
        }

        if(!is_array($json_result) || empty($json_result['status']))
        {
            if(is_array($json_result) && isset($json_result['message']))
            {
                throw new KeybertException((string)$json_result['message']);
            }

            throw new KeybertException(sprintf('Failed to invoke Keybert method "%s", %s.', $method, $result));
        }

        return $json_result['data'] ?? null;
    }

    /**
     * Starts the internal Keybert process and waits until it answers a ping.
     *
     * @return void
     * @throws RuntimeException If python3 cannot be found, the process cannot be created,
     *                          it exits prematurely, or it does not answer within START_TIMEOUT seconds.
     */
    private function start(): void
    {
        if(!$this->internal)
        {
            return;
        }

        $python = (new ExecutableFinder())->find('python3');

        if($python === null)
        {
            throw new RuntimeException('Failed to start Keybert process, the "python3" executable could not be found.');
        }

        try
        {
            $this->process = new Process([
                $python,
                __DIR__ . DIRECTORY_SEPARATOR . 'wrapper.py',
            ]);

            $this->process->setEnv([
                'KEYBERT_ENABLED' => '1',
                'KEYBERT_PORT' => (string)$this->port,
                // array_values() guarantees a JSON list even if the caller passed a keyed array.
                'KEYBERT_PRELOAD' => json_encode(array_values($this->preload), JSON_THROW_ON_ERROR),
            ]);
        }
        catch(Exception $e)
        {
            throw new RuntimeException('Failed to start Keybert process.', 0, $e);
        }

        // Start the process and poll it once per second until it is ready.
        $this->process->start();
        $deadline = time() + self::START_TIMEOUT;
        $last_error = null;

        while($this->process->isRunning())
        {
            $this->printUpdates();

            try
            {
                if($this->invoke('ping'))
                {
                    return;
                }
            }
            catch(Exception $e)
            {
                // Expected while the server is still booting; remembered for the timeout report.
                $last_error = $e;
            }

            if(time() > $deadline)
            {
                throw new RuntimeException('Keybert wrapper process failed to start.', 0, $last_error);
            }

            sleep(1);
        }

        // The process ended before it ever answered: flush its last output and report the exit code.
        $this->printUpdates();
        throw new RuntimeException(sprintf('Keybert wrapper process exited with code %d.', $this->process->getExitCode()));
    }

    /**
     * Stops the internal Keybert process and releases its handle.
     *
     * @return void
     */
    private function stop(): void
    {
        if($this->process === null)
        {
            return;
        }

        if($this->process->isRunning())
        {
            $this->process->stop();
        }

        $this->process = null;
    }

    /**
     * Prints out the updates from the internal Keybert process.
     *
     * @return void
     */
    private function printUpdates(): void
    {
        if(!$this->internal || $this->process === null)
        {
            return;
        }

        print($this->process->getIncrementalOutput());
        print($this->process->getIncrementalErrorOutput());
    }

    /**
     * Returns a port in the given range on which nothing accepted a TCP connection.
     *
     * Ports are probed in random order. The result is only a best guess: another
     * program may still claim the port between this probe and its actual use.
     *
     * @param string $host Host to probe.
     * @param int $start First port of the range (inclusive).
     * @param int $end Last port of the range (inclusive).
     * @return int
     * @throws RuntimeException If every port in the range accepted a connection.
     */
    private static function getAvailablePort(string $host='127.0.0.1', int $start=1024, int $end=65535): int
    {
        $candidates = range($start, $end);
        shuffle($candidates);

        foreach($candidates as $port)
        {
            // A failed connection emits a warning by design, hence the suppression.
            // The explicit one-second timeout keeps an unresponsive port from
            // stalling the scan for the default socket timeout.
            $connection = @stream_socket_client('tcp://' . $host . ':' . $port, $error_code, $error_message, 1.0);

            if($connection === false)
            {
                return $port;
            }

            fclose($connection);
        }

        throw new RuntimeException('No available port found in range ' . $start . ' to ' . $end . '.');
    }

    /**
     * Blocks while the internal Keybert process is running and relays its output once per second.
     *
     * Returns immediately when a remote server is used.
     *
     * @return void
     */
    public function run(): void
    {
        if(!$this->internal || $this->process === null)
        {
            return;
        }

        while($this->process->isRunning())
        {
            $this->printUpdates();
            sleep(1);
        }

        // Relay anything written between the last poll and the process exit.
        $this->printUpdates();
    }

    /**
     * Loads a model into Keybert.
     *
     * @param string $model The model to load, e.g. "paraphrase-multilingual-MiniLM-L12-v2"
     * @return void
     * @throws KeybertException
     */
    public function loadModel(string $model): void
    {
        $this->invoke('load_model', [
            'model_name' => $model,
        ]);
    }

    /**
     * Extracts keywords from the given documents.
     *
     * @param string $model The model to use, e.g. "paraphrase-multilingual-MiniLM-L12-v2"
     * @param string|array $docs The documents or document to extract keywords from.
     * @param array|null $candidates The candidates to use for the keywords (eg; ["NOUN", "PROPN", "ADJ"])
     * @param array $keyphrase_ngram_range The ngram range to use for the keyphrases. (eg; [1, 2])
     * @param string $stop_words The stop words to use. (eg; "english")
     * @param int $top_n The number of keywords to return.
     * @param int $min_df The minimum document frequency.
     * @param bool $use_maxsum Whether to use the MaxSum algorithm.
     * @param bool $use_mmr Whether to use the Maximal Marginal Relevance algorithm.
     * @param float $diversity The diversity of the MMR algorithm.
     * @param int $nr_candidates The number of candidates to use for the MMR algorithm.
     * @param array|null $seed_keywords The seed keywords to use for the MMR algorithm.
     * @return array The extracted keywords.
     * @throws KeybertException If the extraction failed.
     */
    public function extractKeywords(
        string $model,
        string|array $docs,
        ?array $candidates = null,
        array $keyphrase_ngram_range = [1, 1],
        string $stop_words = 'english',
        int $top_n = 5,
        int $min_df = 1,
        bool $use_maxsum = false,
        bool $use_mmr = false,
        float $diversity = 0.5,
        int $nr_candidates = 20,
        ?array $seed_keywords = null
    ): array {
        return $this->invoke('extract_keywords', [
            'model_name' => $model,
            'docs' => $docs,
            'candidates' => $candidates,
            'keyphrase_ngram_range' => $keyphrase_ngram_range,
            'stop_words' => $stop_words,
            'top_n' => $top_n,
            'min_df' => $min_df,
            'use_maxsum' => $use_maxsum,
            'use_mmr' => $use_mmr,
            'diversity' => $diversity,
            'nr_candidates' => $nr_candidates,
            'seed_keywords' => $seed_keywords,
        ]);
    }

    /**
     * Destructor. Stops the internal process, if any, when the client goes away.
     */
    public function __destruct()
    {
        $this->stop();
    }
}
|
59
src/Keybert/Program.php
Normal file
59
src/Keybert/Program.php
Normal file
|
@ -0,0 +1,59 @@
|
|||
<?php
|
||||
|
||||
namespace Keybert;
|
||||
|
||||
use ncc\Runtime;
|
||||
|
||||
/**
 * Command-line entry point that runs a local Keybert wrapper.
 */
class Program
{
    /**
     * Main entry point for the CLI
     *
     * The port and the models to preload are read from the KEYBERT_PORT and
     * KEYBERT_PRELOAD environment variables when set, otherwise from the
     * "port"/"p" and "preload"/"l" entries of $args.
     *
     * @param array $args
     * @return void
     * @throws \InvalidArgumentException If the port or the preload list is malformed.
     */
    public static function main(array $args=[]): void
    {
        $env_port = getenv('KEYBERT_PORT');
        $port = ($env_port === false) ? ($args['port'] ?? $args['p'] ?? null) : $env_port;

        $env_preload = getenv('KEYBERT_PRELOAD');
        $preload = ($env_preload === false) ? ($args['preload'] ?? $args['l'] ?? null) : $env_preload;

        // Keybert expects ?array and ?int; the raw values arrive as strings.
        $keybert = new Keybert(self::parsePreload($preload), null, self::parsePort($port));
        $keybert->run();
    }

    /**
     * Converts a raw port value into an integer, or null when none was given.
     *
     * @param mixed $port Raw value from the environment or the argument list.
     * @return int|null
     * @throws \InvalidArgumentException If the value is not a whole, non-negative number.
     */
    private static function parsePort(mixed $port): ?int
    {
        if($port === null || $port === '')
        {
            return null;
        }

        if(is_int($port))
        {
            return $port;
        }

        if(is_string($port) && ctype_digit($port))
        {
            return (int)$port;
        }

        throw new \InvalidArgumentException(sprintf(
            'Invalid port "%s", expected an integer.',
            is_scalar($port) ? (string)$port : get_debug_type($port)
        ));
    }

    /**
     * Converts a raw preload value into a list of model names.
     *
     * Strings are split on commas (the format described by help()); surrounding
     * whitespace and empty entries are dropped.
     *
     * @param mixed $preload Raw value from the environment or the argument list.
     * @return array|null List of model names, or null when nothing was given.
     * @throws \InvalidArgumentException If the value is neither a string nor an array.
     */
    private static function parsePreload(mixed $preload): ?array
    {
        if($preload === null || $preload === '')
        {
            return null;
        }

        if(is_array($preload))
        {
            return array_values($preload);
        }

        if(!is_string($preload))
        {
            throw new \InvalidArgumentException(sprintf(
                'Invalid preload value of type %s, expected a comma-separated string.',
                get_debug_type($preload)
            ));
        }

        $models = array_filter(
            array_map('trim', explode(',', $preload)),
            static fn(string $model): bool => $model !== ''
        );

        return array_values($models);
    }

    /**
     * Displays the help message and terminates the script with exit code 0.
     *
     * @return void
     */
    public static function help(): void
    {
        // NOTE(review): the package name 'net.nosial.federationlib' looks copied from
        // another project - confirm it is the right source for this version constant.
        print('Keybert v' . Runtime::getConstant('net.nosial.federationlib', 'version') . PHP_EOL . PHP_EOL);

        print('Usage: Keybert [options]' . PHP_EOL);
        print('Options:' . PHP_EOL);
        print(' --port|-p - The port to listen on' . PHP_EOL);
        print(' --preload|-l - Optional. Models to preload, seperated by commas' . PHP_EOL);
        print(PHP_EOL);
        print('Environment Variables:' . PHP_EOL);
        print(' KEYBERT_PORT - The port to listen on' . PHP_EOL);
        print(' KEYBERT_PRELOAD - Optional. Models to preload, seperated by commas' . PHP_EOL);

        exit(0);
    }
}
|
97
src/Keybert/wrapper.py
Normal file
97
src/Keybert/wrapper.py
Normal file
|
@ -0,0 +1,97 @@
|
|||
from aiohttp import web
|
||||
from keybert import KeyBERT
|
||||
import json
|
||||
import os
|
||||
|
||||
app = web.Application()
|
||||
models = {}
|
||||
|
||||
|
||||
# Ping
|
||||
async def ping(request):
    """Liveness probe: always replies with a successful envelope whose data is True."""
    envelope = {'status': True, 'message': None, 'data': True}
    return web.json_response(envelope)
|
||||
|
||||
|
||||
# Load a model
|
||||
async def load_model(request):
    """Load (or reload) the model named by 'model_name' into the module-level cache.

    The reply is always a JSON envelope with 'status', 'message' and 'data';
    any exception is reported through 'message' with status False.
    """
    try:
        payload = await request.json()
        model_name = payload.get('model_name', '')

        # Guard clause: nothing to load without a name.
        if not model_name:
            return web.json_response({'status': False, 'message': 'No model name provided.', 'data': None})

        models[model_name] = KeyBERT(model=model_name)
        return web.json_response({'status': True, 'message': None, 'data': f'Model {model_name} loaded.'})
    except Exception as error:
        return web.json_response({'status': False, 'message': str(error), 'data': None})
|
||||
|
||||
|
||||
# Extract keywords
|
||||
def _pairs_to_dict(pairs):
    """Turn a list of (keyword, score) pairs into a {keyword: score} dictionary."""
    return {word: score for word, score in pairs}


async def extract_keywords(request):
    """Extract keywords from the document(s) in the JSON request body.

    Reads 'model_name', 'docs' and the optional tuning parameters from the body,
    loads the model on first use and replies with a JSON envelope holding
    'status', 'message' and 'data'. For a single document 'data' is a
    {keyword: score} dictionary; when the model returns one result list per
    document, 'data' is a list of such dictionaries in the same order.
    Any exception is reported through 'message' with status False.
    """
    try:
        data = await request.json()
        model_name = data.get('model_name', '')
        docs = data.get('docs', [])
        candidates = data.get('candidates', None)
        keyphrase_ngram_range = tuple(data.get('keyphrase_ngram_range', (1, 1)))
        stop_words = data.get('stop_words', 'english')
        top_n = data.get('top_n', 5)
        min_df = data.get('min_df', 1)
        use_maxsum = data.get('use_maxsum', False)
        use_mmr = data.get('use_mmr', False)
        diversity = data.get('diversity', 0.5)
        nr_candidates = data.get('nr_candidates', 20)
        seed_keywords = data.get('seed_keywords', None)

        # Validate the request before paying for a model load.
        if not model_name:
            return web.json_response({'status': False, 'message': 'No model name provided.', 'data': None})

        if not docs:
            return web.json_response(
                {'status': False, 'message': 'No document provided.', 'data': None})

        # Check if the model is loaded, if not, load it
        if model_name not in models:
            models[model_name] = KeyBERT(model=model_name)

        keywords = models[model_name].extract_keywords(
            docs,
            candidates=candidates,
            keyphrase_ngram_range=keyphrase_ngram_range,
            stop_words=stop_words,
            top_n=top_n,
            min_df=min_df,
            use_maxsum=use_maxsum,
            use_mmr=use_mmr,
            diversity=diversity,
            nr_candidates=nr_candidates,
            seed_keywords=seed_keywords,
        )

        # A multi-document call yields a list of per-document result lists, while a
        # single document yields a flat list of (word, score) tuples. Unpacking the
        # former as pairs would fail, so each shape is converted separately.
        if keywords and all(isinstance(entry, list) for entry in keywords):
            result = [_pairs_to_dict(entry) for entry in keywords]
        else:
            result = _pairs_to_dict(keywords)

        return web.json_response({'status': True, 'message': None, 'data': result})
    except Exception as e:
        return web.json_response({'status': False, 'message': str(e), 'data': None})
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Check for required environment variables
    if os.environ.get('KEYBERT_ENABLED') != '1':
        # The check is against the literal "1", so the message says so.
        print('Error: KEYBERT_ENABLED environment variable not set or not "1".')
        raise SystemExit(1)

    if 'KEYBERT_PORT' not in os.environ:
        print('Error: KEYBERT_PORT environment variable not set.')
        raise SystemExit(1)

    # Validate the port up front so a typo fails fast instead of after model loading.
    try:
        port = int(os.environ['KEYBERT_PORT'])
    except ValueError:
        print('Error: KEYBERT_PORT environment variable is not a valid integer.')
        raise SystemExit(1)

    # Preload models if KEYBERT_PRELOAD is set to a non-empty JSON list
    preload_raw = os.environ.get('KEYBERT_PRELOAD')
    if preload_raw:
        preload_models = json.loads(preload_raw)
        for model_name in preload_models:
            models[model_name] = KeyBERT(model=model_name)
            print(f'Model {model_name} preloaded.')

    # Add routes
    app.router.add_post('/ping', ping)
    app.router.add_post('/load_model', load_model)
    app.router.add_post('/extract_keywords', extract_keywords)

    # Start server. KEYBERT_HOST optionally narrows the bind address; the
    # default keeps the previous behaviour of listening on all interfaces.
    web.run_app(app, host=os.environ.get('KEYBERT_HOST', '0.0.0.0'), port=port)
|
Loading…
Add table
Add a link
Reference in a new issue