Initial Commit
This commit is contained in:
commit
402c3b5cb6
27 changed files with 911 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
build/
|
65
.gitlab-ci.yml
Normal file
65
.gitlab-ci.yml
Normal file
|
@ -0,0 +1,65 @@
|
|||
image: php:8.1
|
||||
|
||||
before_script:
|
||||
# Install some stuff that the image doesn't come with
|
||||
- apt update -yqq
|
||||
- apt install git libpq-dev libzip-dev zip make wget gnupg -yqq
|
||||
|
||||
# Install phive
|
||||
- wget -O phive.phar https://phar.io/releases/phive.phar
|
||||
- wget -O phive.phar.asc https://phar.io/releases/phive.phar.asc
|
||||
- gpg --keyserver hkps://keys.openpgp.org --recv-keys 0x9D8A98B29B2D5D79
|
||||
- gpg --verify phive.phar.asc phive.phar
|
||||
- chmod +x phive.phar
|
||||
- mv phive.phar /usr/local/bin/phive
|
||||
|
||||
# Install phpab
|
||||
- phive install phpab --global --trust-gpg-keys 0x2A8299CE842DD38C
|
||||
|
||||
# Install the latest version of ncc (Nosial Code Compiler)
|
||||
- git clone https://git.n64.cc/nosial/ncc.git
|
||||
- cd ncc
|
||||
- make redist
|
||||
- php build/src/INSTALL --auto --install-composer
|
||||
- cd .. && rm -rf ncc
|
||||
|
||||
# Prepare submodules
|
||||
- git config --global url."https://${CI_ACCESS_USERNAME}:${CI_ACCESS_TOKEN}@git.n64.cc/".insteadOf "https://git.n64.cc/"
|
||||
- git submodule sync --recursive
|
||||
- git submodule update --init --recursive
|
||||
|
||||
build:
|
||||
stage: build
|
||||
script:
|
||||
- ncc build --config release --log-level debug
|
||||
artifacts:
|
||||
paths:
|
||||
- build/
|
||||
rules:
|
||||
- if: $CI_COMMIT_BRANCH
|
||||
|
||||
release:
|
||||
stage: deploy
|
||||
script:
|
||||
- ncc build --config release --log-level debug
|
||||
- >
|
||||
curl --header "JOB-TOKEN: $CI_JOB_TOKEN" --upload-file build/release/net.nosial.keybert.ncc "$CI_API_V4_URL/projects/$CI_PROJECT_ID/packages/generic/net.nosial.keybert/$CI_COMMIT_REF_NAME/net.nosial.keybert.ncc"
|
||||
artifacts:
|
||||
paths:
|
||||
- build/
|
||||
rules:
|
||||
- if: $CI_COMMIT_TAG
|
||||
|
||||
docker-build:
|
||||
stage: deploy
|
||||
image: docker:latest
|
||||
services:
|
||||
- docker:dind
|
||||
before_script:
|
||||
- docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY
|
||||
script:
|
||||
- docker build --no-cache -t $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME .
|
||||
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME
|
||||
only:
|
||||
- branches
|
||||
- tags
|
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
1
.idea/.name
generated
Normal file
1
.idea/.name
generated
Normal file
|
@ -0,0 +1 @@
|
|||
Keybert
|
11
.idea/Keybert.iml
generated
Normal file
11
.idea/Keybert.iml
generated
Normal file
|
@ -0,0 +1,11 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="WEB_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/tests" isTestSource="true" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
29
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
29
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
|
@ -0,0 +1,29 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="IncorrectHttpHeaderInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="customHeaders">
|
||||
<set>
|
||||
<option value="Subject" />
|
||||
<option value="Reply-To" />
|
||||
<option value="X-JSON-Schema" />
|
||||
<option value="X-JSON-Type" />
|
||||
<option value="X-JSON-Path" />
|
||||
<option value="X-Java-Type" />
|
||||
<option value="X-Region-Id" />
|
||||
<option value="X-GraphQL-Variables" />
|
||||
<option value="X-SSH-Private-Key" />
|
||||
<option value="X-Temperature" />
|
||||
<option value="X-Model" />
|
||||
<option value="X-OPENAI-API-KEY" />
|
||||
<option value="X-Args-0" />
|
||||
<option value="X-Args-1" />
|
||||
<option value="X-Args-2" />
|
||||
<option value="X-Args-3" />
|
||||
<option value="X-Args-4" />
|
||||
<option value="X-Args-5" />
|
||||
</set>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
4
.idea/misc.xml
generated
Normal file
4
.idea/misc.xml
generated
Normal file
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/Keybert.iml" filepath="$PROJECT_DIR$/.idea/Keybert.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
28
.idea/php.xml
generated
Normal file
28
.idea/php.xml
generated
Normal file
|
@ -0,0 +1,28 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="MessDetectorOptionsConfiguration">
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
<component name="PHPCSFixerOptionsConfiguration">
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
<component name="PHPCodeSnifferOptionsConfiguration">
|
||||
<option name="highlightLevel" value="WARNING" />
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
<component name="PhpIncludePathManager">
|
||||
<include_path>
|
||||
<path value="/usr/share/php" />
|
||||
<path value="/etc/ncc" />
|
||||
<path value="/var/ncc/packages/com.symfony.process=6.2.10" />
|
||||
<path value="/var/ncc/packages/net.nosial.optslib=1.0.0" />
|
||||
</include_path>
|
||||
</component>
|
||||
<component name="PhpProjectSharedConfiguration" php_language_level="8.2" />
|
||||
<component name="PhpStanOptionsConfiguration">
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
<component name="PsalmOptionsConfiguration">
|
||||
<option name="transferred" value="true" />
|
||||
</component>
|
||||
</project>
|
10
.idea/runConfigurations/Build.xml
generated
Normal file
10
.idea/runConfigurations/Build.xml
generated
Normal file
|
@ -0,0 +1,10 @@
|
|||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="Build" type="MAKEFILE_TARGET_RUN_CONFIGURATION" factoryName="Makefile" activateToolWindowBeforeRun="false">
|
||||
<makefile filename="$PROJECT_DIR$/Makefile" target="build" workingDirectory="" arguments="">
|
||||
<envs />
|
||||
</makefile>
|
||||
<method v="2">
|
||||
<option name="RunConfigurationTask" enabled="false" run_configuration_name="Clean" run_configuration_type="MAKEFILE_TARGET_RUN_CONFIGURATION" />
|
||||
</method>
|
||||
</configuration>
|
||||
</component>
|
8
.idea/runConfigurations/Clean.xml
generated
Normal file
8
.idea/runConfigurations/Clean.xml
generated
Normal file
|
@ -0,0 +1,8 @@
|
|||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="Clean" type="MAKEFILE_TARGET_RUN_CONFIGURATION" factoryName="Makefile" activateToolWindowBeforeRun="false">
|
||||
<makefile filename="$PROJECT_DIR$/Makefile" target="clean" workingDirectory="" arguments="">
|
||||
<envs />
|
||||
</makefile>
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
10
.idea/runConfigurations/Install.xml
generated
Normal file
10
.idea/runConfigurations/Install.xml
generated
Normal file
|
@ -0,0 +1,10 @@
|
|||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="Install" type="MAKEFILE_TARGET_RUN_CONFIGURATION" factoryName="Makefile" activateToolWindowBeforeRun="false">
|
||||
<makefile filename="$PROJECT_DIR$/Makefile" target="install" workingDirectory="" arguments="">
|
||||
<envs />
|
||||
</makefile>
|
||||
<method v="2">
|
||||
<option name="RunConfigurationTask" enabled="false" run_configuration_name="Build" run_configuration_type="MAKEFILE_TARGET_RUN_CONFIGURATION" />
|
||||
</method>
|
||||
</configuration>
|
||||
</component>
|
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
12
CHANGELOG.md
Executable file
12
CHANGELOG.md
Executable file
|
@ -0,0 +1,12 @@
|
|||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
|
||||
## [1.0.0] - 2023-07-23
|
||||
|
||||
### Added
|
||||
* First Release
|
50
Dockerfile
Normal file
50
Dockerfile
Normal file
|
@ -0,0 +1,50 @@
|
|||
# Dockerfile
|
||||
# Use an existing docker image as a base
|
||||
FROM repo.n64.cc:443/nosial/ncc:debian as builder
|
||||
|
||||
# Metadata
|
||||
LABEL maintainer="Netkas <netkas@nosial.net>"
|
||||
LABEL version="1.0"
|
||||
LABEL description="Keybert"
|
||||
|
||||
# Set working directory in the container
|
||||
WORKDIR /keybert
|
||||
|
||||
# Increase PHP memory limit
|
||||
RUN echo "memory_limit=-1" > /usr/local/etc/php/conf.d/memory-limit.ini
|
||||
|
||||
# Copy local code to the container
|
||||
COPY . ./
|
||||
|
||||
# Install build dependencies for numpy and scipy
|
||||
RUN apt-get update && apt-get install -y \
|
||||
zip \
|
||||
unzip \
|
||||
build-essential \
|
||||
gfortran \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-dev \
|
||||
python3-venv
|
||||
|
||||
# Create a Python virtual environment
|
||||
RUN python3 -m venv /opt/venv
|
||||
# Make sure we use the virtualenv:
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Install aiohttp and Keybert
|
||||
RUN pip install --upgrade pip \
|
||||
&& pip install aiohttp keybert
|
||||
|
||||
# Build and install Keybert
|
||||
RUN ncc build --build-configuration release --log-level debug \
|
||||
&& ncc package install -p "build/release/net.nosial.keybert.ncc" --log-level debug -y
|
||||
|
||||
# Clean up unnecessary files
|
||||
RUN rm -rf /keybert
|
||||
|
||||
# Set working directory in the container
|
||||
WORKDIR /
|
||||
|
||||
# Execute Keybert
|
||||
CMD ["ncc", "exec", "--package", "net.nosial.keybert", "--exec-unit", "main", "--exec-args", "run"]
|
14
LICENSE
Normal file
14
LICENSE
Normal file
|
@ -0,0 +1,14 @@
|
|||
Copyright 2022-2023 Nosial
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
|
||||
documentation files (the “Software”), to deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
|
||||
persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
||||
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
8
Makefile
Normal file
8
Makefile
Normal file
|
@ -0,0 +1,8 @@
|
|||
# None of these targets produce a file named after themselves. In particular
# "build" collides with the build/ output directory: without .PHONY, make
# considers the target up to date as soon as that directory exists and
# silently skips the compilation.
.PHONY: clean build install

# Remove the build output directory, if present.
clean:
	if [ -d build ]; then rm -rf build; fi

# Compile the package using the "release" build configuration.
build:
	ncc build --build-configuration release --log-level debug

# Install (or reinstall) the previously built package system-wide.
install:
	sudo ncc package install -p "build/release/net.nosial.keybert.ncc" --reinstall --skip-dependencies --log-level debug -y
|
14
README.md
Normal file
14
README.md
Normal file
|
@ -0,0 +1,14 @@
|
|||
# Keybert
|
||||
|
||||
KeyBERT is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings to create keywords and
|
||||
keyphrases that are most similar to a document.
|
||||
|
||||
This library is a PHP wrapper for the [KeyBERT](https://github.com/MaartenGr/KeyBERT) Python library, allowing
|
||||
you to use it in your PHP projects or spawn it as an individual process to use from another machine on your network.
|
||||
|
||||
This documentation is a work in progress. Please check back later for more information.
|
||||
|
||||
|
||||
# License
|
||||
|
||||
This project is licensed under MIT. Please see the [LICENSE](LICENSE) file for more information.
|
15
docker-compose.yml
Normal file
15
docker-compose.yml
Normal file
|
@ -0,0 +1,15 @@
|
|||
version: '3'
|
||||
|
||||
services:
|
||||
keybert:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
container_name: keybert
|
||||
restart: unless-stopped
|
||||
tty: true
|
||||
environment:
|
||||
- KEYBERT_PORT=2131
|
||||
- KEYBERT_PRELOAD=["paraphrase-multilingual-MiniLM-L12-v2"]
|
||||
ports:
|
||||
- "2131:2131"
|
6
main
Normal file
6
main
Normal file
|
@ -0,0 +1,6 @@
|
|||
<?php
|
||||
|
||||
require 'ncc';
|
||||
import('net.nosial.keybert');
|
||||
|
||||
\Keybert\Program::main(\OptsLib\Parse::getArguments());
|
54
project.json
Normal file
54
project.json
Normal file
|
@ -0,0 +1,54 @@
|
|||
{
|
||||
"project": {
|
||||
"compiler": {
|
||||
"extension": "php",
|
||||
"minimum_version": "8.0",
|
||||
"maximum_version": "8.2"
|
||||
},
|
||||
"options": {
|
||||
"create_symlink": true
|
||||
}
|
||||
},
|
||||
"assembly": {
|
||||
"name": "Keybert",
|
||||
"package": "net.nosial.keybert",
|
||||
"version": "1.0.0",
|
||||
"uuid": "1695515c-2857-11ee-a7a6-6d740ea6cd07"
|
||||
},
|
||||
"execution_policies": [
|
||||
{
|
||||
"name": "main",
|
||||
"runner": "php",
|
||||
"execute": {
|
||||
"target": "main",
|
||||
"working_directory": "%CWD%",
|
||||
"tty": true
|
||||
}
|
||||
}
|
||||
],
|
||||
"build": {
|
||||
"source_path": "src",
|
||||
"default_configuration": "release",
|
||||
"main": "main",
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "com.symfony.process",
|
||||
"version": "latest",
|
||||
"source_type": "remote",
|
||||
"source": "symfony/process=latest@composer"
|
||||
},
|
||||
{
|
||||
"name": "net.nosial.optslib",
|
||||
"version": "latest",
|
||||
"source_type": "remote",
|
||||
"source": "nosial/libs.opts=latest@n64"
|
||||
}
|
||||
],
|
||||
"configurations": [
|
||||
{
|
||||
"name": "release",
|
||||
"output_path": "build/release"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
13
src/Keybert/Exceptions/KeybertException.php
Normal file
13
src/Keybert/Exceptions/KeybertException.php
Normal file
|
@ -0,0 +1,13 @@
|
|||
<?php
|
||||
|
||||
namespace Keybert\Exceptions;
|
||||
|
||||
use Throwable;
|
||||
|
||||
/**
 * Exception type dedicated to Keybert failures, so callers can tell them
 * apart from generic exceptions.
 */
class KeybertException extends \Exception
{
    /**
     * Builds the exception by handing every argument straight to the base class.
     *
     * @param string $message Human readable description of the failure.
     * @param int $code Optional numeric error code.
     * @param Throwable|null $previous Optional underlying cause.
     */
    public function __construct(string $message = '', int $code = 0, ?Throwable $previous = null)
    {
        parent::__construct($message, $code, $previous);
    }
}
|
346
src/Keybert/Keybert.php
Normal file
346
src/Keybert/Keybert.php
Normal file
|
@ -0,0 +1,346 @@
|
|||
<?php
|
||||
|
||||
/** @noinspection PhpMissingFieldTypeInspection */
|
||||
|
||||
namespace Keybert;
|
||||
|
||||
use Exception;
|
||||
use JsonException;
|
||||
use Keybert\Exceptions\KeybertException;
|
||||
use RuntimeException;
|
||||
use Symfony\Component\Process\ExecutableFinder;
|
||||
use Symfony\Component\Process\Process;
|
||||
|
||||
/**
 * PHP client for the Keybert HTTP wrapper (wrapper.py).
 *
 * When no host is given the class spawns wrapper.py as a local child process
 * ("internal" mode) and talks to it on 127.0.0.1; otherwise it only acts as an
 * HTTP client for an already running remote wrapper.
 */
class Keybert
{
    /**
     * Handle of the locally spawned wrapper process (internal mode only).
     *
     * @var Process|null
     */
    private $process;

    /**
     * Host the wrapper is reachable on.
     *
     * @var string|null
     */
    private $host;

    /**
     * Port the wrapper is reachable on.
     *
     * @var int
     */
    private $port;

    /**
     * True when this instance owns a locally spawned wrapper process.
     *
     * @var bool
     */
    private $internal;

    /**
     * Model names handed to the local wrapper for preloading.
     *
     * @var array
     */
    private $preload;

    /**
     * @param array|null $preload Model names to preload when the wrapper is started locally.
     * @param string|null $host Remote host; null starts a local wrapper process instead.
     * @param int|null $port Port to use; required for remote hosts, picked at random locally when null.
     * @throws RuntimeException If a remote host is given without a port, or the local wrapper fails to start.
     */
    public function __construct(?array $preload=null, ?string $host=null, ?int $port=null)
    {
        $this->internal = ($host === null);
        $this->preload = $preload ?? [];

        if(!$this->internal && $port === null)
        {
            throw new RuntimeException('If Keybert is running remotely, a port must be specified. (Host is not empty, but the port is.)');
        }

        $this->host = $this->internal ? '127.0.0.1' : $host;
        $this->port = $port ?? self::getAvailablePort();

        // Start the internal Keybert process if we're running locally.
        if($this->internal)
        {
            $this->start();
        }
    }

    /**
     * Sends a JSON POST request to the wrapper and returns the "data" field of its reply.
     *
     * @param string $method Endpoint name, e.g. "ping" or "extract_keywords".
     * @param array|null $parameters Request body; an empty array is sent when null.
     * @return mixed The "data" member of the response envelope.
     * @throws KeybertException If the wrapper answered with a falsy or missing "status".
     * @throws RuntimeException If the request could not be performed or decoded.
     * @noinspection HttpUrlsUsage
     */
    private function invoke(string $method, ?array $parameters = null): mixed
    {
        $ch = null;

        try
        {
            $ch = curl_init(sprintf('http://%s:%d/%s', $this->host, $this->port, $method));
            if($ch === false)
            {
                throw new RuntimeException('Failed to initialize cURL.');
            }

            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($parameters ?? [], JSON_THROW_ON_ERROR));
            curl_setopt($ch, CURLOPT_HTTPHEADER, [
                'Content-Type: application/json',
            ]);

            $result = curl_exec($ch);
            if($result === false)
            {
                // Keep the transport error so it shows up in the exception chain.
                throw new RuntimeException(sprintf('cURL error: %s', curl_error($ch)));
            }

            // Check the status code before decoding: non-200 replies are not
            // guaranteed to carry a JSON body, and decoding first would hide
            // the real cause behind a JSON syntax error.
            $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            if($http_code !== 200)
            {
                throw new RuntimeException('HTTP request failed with status code ' . $http_code);
            }

            $json_result = json_decode($result, true, 512, JSON_THROW_ON_ERROR);
        }
        catch(Exception $e)
        {
            throw new RuntimeException('Failed to invoke Keybert method "' . $method . '".', 0, $e);
        }
        finally
        {
            if($ch)
            {
                curl_close($ch);
            }

            $this->printUpdates();
        }

        if(!is_array($json_result) || !isset($json_result['status']) || !$json_result['status'])
        {
            if(is_array($json_result) && isset($json_result['message']))
            {
                throw new KeybertException($json_result['message']);
            }

            throw new KeybertException(sprintf('Failed to invoke Keybert method "%s", %s.', $method, $result));
        }

        return $json_result['data'] ?? null;
    }

    /**
     * Starts the internal Keybert process and waits (up to two minutes) until it answers a ping.
     *
     * @return void
     * @throws RuntimeException If python3 cannot be found, or the wrapper exits or does not become ready in time.
     */
    private function start(): void
    {
        if(!$this->internal)
        {
            return;
        }

        try
        {
            $python = (new ExecutableFinder())->find('python3');
            if($python === null)
            {
                throw new RuntimeException('Unable to locate the "python3" executable.');
            }

            $this->process = new Process([
                $python,
                __DIR__ . DIRECTORY_SEPARATOR . 'wrapper.py',
            ]);

            // The wrapper is a long-running server; lift the default process timeout.
            $this->process->setTimeout(null);

            $this->process->setEnv([
                'KEYBERT_ENABLED' => '1',
                'KEYBERT_PORT' => (string)$this->port,
                // array_values() guarantees a JSON list even if the caller passed string keys.
                'KEYBERT_PRELOAD' => json_encode(array_values($this->preload), JSON_THROW_ON_ERROR),
            ]);
        }
        catch(Exception $e)
        {
            throw new RuntimeException('Failed to start Keybert process.', 0, $e);
        }

        // Start the process and wait for it to be ready. (2-Minutes timeout)
        $this->process->start();
        $start_time = time();

        while($this->process->getExitCode() === null)
        {
            $this->printUpdates();
            $failure = null;

            try
            {
                if($this->invoke('ping'))
                {
                    break;
                }
            }
            catch(Exception $e)
            {
                $failure = $e;
            }

            // Enforce the deadline on every failed attempt, not only when the
            // ping threw, and do not leave the child running once we give up.
            if(time() - $start_time > 120)
            {
                $this->stop();
                throw new RuntimeException('Keybert wrapper process failed to start.', 0, $failure);
            }

            sleep(1);
        }

        if($this->process->getExitCode() !== null)
        {
            throw new RuntimeException(sprintf('Keybert wrapper process exited with code %d.', $this->process->getExitCode()));
        }
    }

    /**
     * Stops the internal Keybert process.
     *
     * @return void
     */
    private function stop(): void
    {
        if($this->process === null)
        {
            return;
        }

        if($this->process->isRunning())
        {
            $this->process->stop();
        }

        $this->process = null;
    }

    /**
     * Prints out the updates from the internal Keybert process.
     *
     * @return void
     */
    private function printUpdates(): void
    {
        if(!$this->internal || !$this->process)
        {
            return;
        }

        print($this->process->getIncrementalOutput());
        print($this->process->getIncrementalErrorOutput());
    }

    /**
     * Returns an available port in the given range.
     *
     * A port counts as available when a TCP connection to it cannot be
     * established. This is a best-effort probe: another process may still
     * claim the port before the wrapper binds to it.
     *
     * @param string $host Host to probe.
     * @param int $start First port of the range (inclusive).
     * @param int $end Last port of the range (inclusive).
     * @return int
     * @throws RuntimeException If every port in the range accepted a connection.
     */
    private static function getAvailablePort(string $host='127.0.0.1', int $start=1024, int $end=65535): int
    {
        $range = range($start, $end);
        shuffle($range);

        foreach ($range as $port)
        {
            // Short explicit timeout so an unresponsive port cannot stall the
            // scan for the full default socket timeout.
            $connection = @stream_socket_client('tcp://' . $host . ':' . $port, $errno, $errstr, 1.0);

            if (!is_resource($connection))
            {
                return $port;
            }

            fclose($connection);
        }

        throw new RuntimeException('No available port found in range ' . $start . ' to ' . $end . '.');
    }

    /**
     * Runs the internal Keybert process and prints out the updates until it exits.
     * Returns immediately when this instance talks to a remote wrapper.
     *
     * @return void
     */
    public function run(): void
    {
        if(!$this->internal || $this->process === null)
        {
            return;
        }

        while($this->process->isRunning())
        {
            $this->printUpdates();
            sleep(1);
        }

        // Flush whatever the process wrote right before exiting.
        $this->printUpdates();
    }

    /**
     * Loads a model into Keybert.
     *
     * @param string $model The model to load, e.g. "paraphrase-multilingual-MiniLM-L12-v2"
     * @return void
     * @throws KeybertException
     */
    public function loadModel(string $model): void
    {
        $this->invoke('load_model', [
            'model_name' => $model,
        ]);
    }

    /**
     * Extracts keywords from the given documents.
     *
     * @param string $model The model to use, e.g. "paraphrase-multilingual-MiniLM-L12-v2"
     * @param string|array $docs The documents or document to extract keywords from.
     * @param array|null $candidates The candidates to use for the keywords (eg; ["NOUN", "PROPN", "ADJ"])
     * @param array $keyphrase_ngram_range The ngram range to use for the keyphrases. (eg; [1, 2])
     * @param string $stop_words The stop words to use. (eg; "english")
     * @param int $top_n The number of keywords to return.
     * @param int $min_df The minimum document frequency.
     * @param bool $use_maxsum Whether to use the MaxSum algorithm.
     * @param bool $use_mmr Whether to use the Maximal Marginal Relevance algorithm.
     * @param float $diversity The diversity of the MMR algorithm.
     * @param int $nr_candidates The number of candidates to use for the MMR algorithm.
     * @param array|null $seed_keywords The seed keywords to use for the MMR algorithm.
     * @return array The extracted keywords.
     * @throws KeybertException If the extraction failed.
     */
    public function extractKeywords(
        string $model,
        string|array $docs,
        ?array $candidates = null,
        array $keyphrase_ngram_range = [1, 1],
        string $stop_words = 'english',
        int $top_n = 5,
        int $min_df = 1,
        bool $use_maxsum = false,
        bool $use_mmr = false,
        float $diversity = 0.5,
        int $nr_candidates = 20,
        ?array $seed_keywords = null
    ): array {
        return $this->invoke('extract_keywords', [
            'model_name' => $model,
            'docs' => $docs,
            'candidates' => $candidates,
            'keyphrase_ngram_range' => $keyphrase_ngram_range,
            'stop_words' => $stop_words,
            'top_n' => $top_n,
            'min_df' => $min_df,
            'use_maxsum' => $use_maxsum,
            'use_mmr' => $use_mmr,
            'diversity' => $diversity,
            'nr_candidates' => $nr_candidates,
            'seed_keywords' => $seed_keywords,
        ]);
    }

    /**
     * Destructor. Makes sure a locally spawned wrapper does not outlive this object.
     */
    public function __destruct()
    {
        $this->stop();
    }

}
|
59
src/Keybert/Program.php
Normal file
59
src/Keybert/Program.php
Normal file
|
@ -0,0 +1,59 @@
|
|||
<?php
|
||||
|
||||
namespace Keybert;
|
||||
|
||||
use ncc\Runtime;
|
||||
|
||||
/**
 * Command line entry point: starts a local Keybert wrapper and relays its output.
 */
class Program
{
    /**
     * Main entry point for the CLI
     *
     * The port and the models to preload are taken from the KEYBERT_PORT and
     * KEYBERT_PRELOAD environment variables when set, otherwise from the
     * --port/-p and --preload/-l options.
     *
     * @param array $args Parsed command line arguments.
     * @return void
     * @throws \InvalidArgumentException If the port or the preload list is malformed.
     */
    public static function main(array $args=[]): void
    {
        if(isset($args['help']) || isset($args['h']))
        {
            self::help();
        }

        $env_port = getenv('KEYBERT_PORT');
        $env_preload = getenv('KEYBERT_PRELOAD');

        $port = self::parsePort(
            $env_port !== false ? $env_port : ($args['port'] ?? $args['p'] ?? null)
        );
        $preload = self::parsePreload(
            $env_preload !== false ? $env_preload : ($args['preload'] ?? $args['l'] ?? null)
        );

        $keybert = new Keybert($preload, null, $port);
        $keybert->run();
    }

    /**
     * Converts a raw port value (environment variable or CLI option) into an integer.
     *
     * @param mixed $value Raw value; null or an empty string means "not specified".
     * @return int|null The port, or null when none was specified.
     * @throws \InvalidArgumentException If the value is not an integer between 1 and 65535.
     */
    private static function parsePort(mixed $value): ?int
    {
        if($value === null || $value === '')
        {
            return null;
        }

        $port = false;
        if(is_int($value) || is_string($value))
        {
            $port = filter_var($value, FILTER_VALIDATE_INT, [
                'options' => ['min_range' => 1, 'max_range' => 65535],
            ]);
        }

        if($port === false)
        {
            throw new \InvalidArgumentException(sprintf('Invalid port %s, expected an integer between 1 and 65535.', var_export($value, true)));
        }

        return $port;
    }

    /**
     * Converts a raw preload value into a list of model names.
     *
     * Accepts an array, a JSON array string (the form used in docker-compose.yml)
     * or a comma separated string (the form described in the help message).
     *
     * @param mixed $value Raw value; null, true (flag without value) or an empty string means "nothing to preload".
     * @return array|null List of model names, or null when there is nothing to preload.
     * @throws \InvalidArgumentException If a JSON value cannot be decoded into an array.
     */
    private static function parsePreload(mixed $value): ?array
    {
        if($value === null || $value === true || $value === false)
        {
            return null;
        }

        if(is_array($value))
        {
            $candidates = $value;
        }
        else
        {
            $value = trim((string)$value);
            if($value === '')
            {
                return null;
            }

            if($value[0] === '[')
            {
                try
                {
                    $candidates = json_decode($value, true, 512, JSON_THROW_ON_ERROR);
                }
                catch(\JsonException $e)
                {
                    throw new \InvalidArgumentException('The list of models to preload is not valid JSON.', 0, $e);
                }

                if(!is_array($candidates))
                {
                    throw new \InvalidArgumentException('The list of models to preload must be a JSON array.');
                }
            }
            else
            {
                $candidates = explode(',', $value);
            }
        }

        $models = [];
        foreach($candidates as $candidate)
        {
            if(is_string($candidate) && trim($candidate) !== '')
            {
                $models[] = trim($candidate);
            }
        }

        return $models === [] ? null : $models;
    }

    /**
     * Displays the help message and terminates the script.
     *
     * @return void
     */
    public static function help(): void
    {
        print('Keybert v' . Runtime::getConstant('net.nosial.keybert', 'version') . PHP_EOL . PHP_EOL);

        print('Usage: Keybert [options]' . PHP_EOL);
        print('Options:' . PHP_EOL);
        print('  --port|-p - The port to listen on' . PHP_EOL);
        print('  --preload|-l - Optional. Models to preload, separated by commas' . PHP_EOL);
        print(PHP_EOL);
        print('Environment Variables:' . PHP_EOL);
        print('  KEYBERT_PORT - The port to listen on' . PHP_EOL);
        print('  KEYBERT_PRELOAD - Optional. Models to preload, separated by commas or as a JSON array' . PHP_EOL);

        exit(0);
    }
}
|
97
src/Keybert/wrapper.py
Normal file
97
src/Keybert/wrapper.py
Normal file
|
@ -0,0 +1,97 @@
|
|||
from aiohttp import web
|
||||
from keybert import KeyBERT
|
||||
import json
|
||||
import os
|
||||
|
||||
app = web.Application()
|
||||
models = {}
|
||||
|
||||
|
||||
# Ping
|
||||
async def ping(request):
    """Health-check endpoint: always answers with a successful JSON envelope."""
    payload = {'status': True, 'message': None, 'data': True}
    return web.json_response(payload)
|
||||
|
||||
|
||||
# Load a model
|
||||
async def load_model(request):
    """Instantiate the KeyBERT model named in the request body and cache it.

    Reads `model_name` from the JSON body and stores a new `KeyBERT` instance
    in the module-level `models` dict under that name. Every outcome,
    including errors, is reported through the `status`/`message`/`data`
    JSON envelope.
    """
    try:
        body = await request.json()
        model_name = body.get('model_name', '')

        # Guard clause: nothing to load without a name.
        if not model_name:
            return web.json_response({'status': False, 'message': 'No model name provided.', 'data': None})

        models[model_name] = KeyBERT(model=model_name)
        return web.json_response({'status': True, 'message': None, 'data': f'Model {model_name} loaded.'})
    except Exception as error:
        return web.json_response({'status': False, 'message': str(error), 'data': None})
|
||||
|
||||
|
||||
# Extract keywords
|
||||
async def extract_keywords(request):
    """Extract keywords from one document or a list of documents.

    Reads the extraction options from the JSON request body, loads the
    requested model on first use, and replies with the usual
    `status`/`message`/`data` envelope. `data` is a `{keyword: score}`
    mapping for a single document, or a list of such mappings (one per
    document) when the model returns one keyword list per document.
    """
    try:
        data = await request.json()
        model_name = data.get('model_name', '')
        docs = data.get('docs', [])

        # Validate the cheap inputs before paying for a model load.
        if not model_name:
            return web.json_response({'status': False, 'message': 'No model name provided.', 'data': None})

        if not docs:
            return web.json_response(
                {'status': False, 'message': 'No document provided.', 'data': None})

        # Check if the model is loaded, if not, load it
        if model_name not in models:
            models[model_name] = KeyBERT(model=model_name)

        keywords = models[model_name].extract_keywords(
            docs,
            candidates=data.get('candidates', None),
            # An explicit null in the body falls back to the default range.
            keyphrase_ngram_range=tuple(data.get('keyphrase_ngram_range') or (1, 1)),
            stop_words=data.get('stop_words', 'english'),
            top_n=data.get('top_n', 5),
            min_df=data.get('min_df', 1),
            use_maxsum=data.get('use_maxsum', False),
            use_mmr=data.get('use_mmr', False),
            diversity=data.get('diversity', 0.5),
            nr_candidates=data.get('nr_candidates', 20),
            seed_keywords=data.get('seed_keywords', None),
        )

        def to_mapping(pairs):
            # Transform a list of (keyword, score) tuples into a dictionary.
            return {word: float(score) for word, score in pairs}

        # With several documents the result is a list of per-document lists
        # rather than a flat list of tuples; unpacking those as (word, score)
        # pairs would fail or produce garbage.
        if keywords and all(isinstance(item, list) for item in keywords):
            result = [to_mapping(item) for item in keywords]
        else:
            result = to_mapping(keywords)

        return web.json_response({'status': True, 'message': None, 'data': result})
    except Exception as e:
        return web.json_response({'status': False, 'message': str(e), 'data': None})
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Check for required environment variables
    if os.environ.get('KEYBERT_ENABLED') != '1':
        print('Error: KEYBERT_ENABLED environment variable not set or not "1".')
        raise SystemExit(1)

    if 'KEYBERT_PORT' not in os.environ:
        print('Error: KEYBERT_PORT environment variable not set.')
        raise SystemExit(1)

    try:
        port = int(os.environ['KEYBERT_PORT'])
    except ValueError:
        print('Error: KEYBERT_PORT environment variable is not a valid integer.')
        raise SystemExit(1)

    # Preload models if KEYBERT_PRELOAD is set (expects a JSON array of model names)
    if os.environ.get('KEYBERT_PRELOAD'):
        try:
            preload_models = json.loads(os.environ['KEYBERT_PRELOAD'])
        except json.JSONDecodeError as e:
            print(f'Error: KEYBERT_PRELOAD is not valid JSON: {e}')
            raise SystemExit(1)

        # A bare JSON string would otherwise be iterated character by character.
        if isinstance(preload_models, str):
            preload_models = [preload_models]

        for model_name in preload_models:
            models[model_name] = KeyBERT(model=model_name)
            print(f'Model {model_name} preloaded.')

    # Add routes
    app.router.add_post('/ping', ping)
    app.router.add_post('/load_model', load_model)
    app.router.add_post('/extract_keywords', extract_keywords)

    # Start server. KEYBERT_HOST optionally narrows the bind address; the
    # default keeps listening on all interfaces as before.
    web.run_app(app, host=os.environ.get('KEYBERT_HOST', '0.0.0.0'), port=port)
|
11
tests/model_test.php
Normal file
11
tests/model_test.php
Normal file
|
@ -0,0 +1,11 @@
|
|||
<?php
|
||||
|
||||
require 'ncc';
|
||||
import('net.nosial.keybert');
|
||||
|
||||
$keybert = new \Keybert\Keybert(null, '127.0.0.1', 1241);
|
||||
|
||||
$document = 'The history of natural language processing (NLP) generally started in the 1950s, although work can be found from earlier periods. In 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.';
|
||||
$keywords = $keybert->extractKeywords('paraphrase-multilingual-MiniLM-L12-v2', $document);
|
||||
|
||||
var_dump($keywords);
|
23
tests/wrapper.http
Normal file
23
tests/wrapper.http
Normal file
|
@ -0,0 +1,23 @@
|
|||
###
|
||||
POST http://0.0.0.0:5000/load_model
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model_name": "paraphrase-multilingual-mpnet-base-v2"
|
||||
}
|
||||
|
||||
###
|
||||
POST http://0.0.0.0:5000/extract_keywords
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model_name": "paraphrase-multilingual-mpnet-base-v2",
|
||||
"docs": "The history of natural language processing (NLP) generally started in the 1950s, although work can be found from earlier periods. In 1950, Alan Turing published an article titled \"Computing Machinery and Intelligence\" which proposed what is now called the Turing test as a criterion of intelligence.",
|
||||
"keyphrase_ngram_range": [1, 1],
|
||||
"top_n": 5,
|
||||
"min_df": 1,
|
||||
"use_maxsum": false,
|
||||
"use_mmr": false,
|
||||
"diversity": 0.5,
|
||||
"nr_candidates": 20
|
||||
}
|
Loading…
Add table
Reference in a new issue