diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a1ce11..5b44b41 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.0.1] - 2023-07-29 + +### Fixed + * Fixed issue where multi-document queries would throw an unexpected exception due to the way tuples are handled in the + wrapper class. + ## [1.0.0] - 2023-07-23 diff --git a/project.json b/project.json index ca5414c..8d2252e 100644 --- a/project.json +++ b/project.json @@ -12,7 +12,7 @@ "assembly": { "name": "Keybert", "package": "net.nosial.keybert", - "version": "1.0.0", + "version": "1.0.1", "uuid": "1695515c-2857-11ee-a7a6-6d740ea6cd07" }, "execution_policies": [ diff --git a/src/Keybert/Keybert.php b/src/Keybert/Keybert.php index 076c35b..e99aac1 100644 --- a/src/Keybert/Keybert.php +++ b/src/Keybert/Keybert.php @@ -39,6 +39,7 @@ private $preload; /** + * @param array|null $preload * @param string|null $host * @param int|null $port */ @@ -319,7 +320,7 @@ int $nr_candidates = 20, array $seed_keywords = null ): array { - return $this->invoke('extract_keywords', [ + $results = $this->invoke('extract_keywords', [ 'model_name' => $model, 'docs' => $docs, // And this line 'candidates' => $candidates, @@ -333,6 +334,55 @@ 'nr_candidates' => $nr_candidates, 'seed_keywords' => $seed_keywords, ]); + + return $this->parseResults($results); + } + + /** + * Parses the results from the internal Keybert process. 
+ * + * @param $results + * @return array + */ + private function parseResults($results): array + { + if (is_array($results)) + { + // check if the results are multi or single + if(is_array($results[0][0])) + { + // This is a multi-results + $parsedResults = []; + foreach($results as $result) + { + $parsedResults[] = $this->parseSingleResult($result); + } + } + else + { + // This is a single-result + $parsedResults = $this->parseSingleResult($results); + } + } + + return $parsedResults ?? []; + } + + /** + * Parses a single result from the internal Keybert process. + * + * @param $result + * @return array + */ + private function parseSingleResult($result): array + { + $parsedResult = []; + foreach($result as $keywordScore) + { + $parsedResult[$keywordScore[0]] = $keywordScore[1]; + } + + return $parsedResult; } /** diff --git a/src/Keybert/wrapper.py b/src/Keybert/wrapper.py index 990b850..3480d76 100644 --- a/src/Keybert/wrapper.py +++ b/src/Keybert/wrapper.py @@ -48,7 +48,7 @@ async def extract_keywords(request): models[model_name] = KeyBERT(model=model_name) if docs: - keywords = models[model_name].extract_keywords( + all_keywords = models[model_name].extract_keywords( docs, candidates=candidates, keyphrase_ngram_range=keyphrase_ngram_range, @@ -61,9 +61,8 @@ async def extract_keywords(request): nr_candidates=nr_candidates, seed_keywords=seed_keywords, ) - # Transform the result from a list of tuples to a dictionary - keywords_dict = {word: score for word, score in keywords} - return web.json_response({'status': True, 'message': None, 'data': keywords_dict}) + return web.json_response({'status': True, 'message': None, 'data': all_keywords}) + else: return web.json_response( {'status': False, 'message': 'No document provided.', 'data': None}) diff --git a/tests/model_test.php b/tests/model_test.php index b8a0d15..db3e3e1 100644 --- a/tests/model_test.php +++ b/tests/model_test.php @@ -3,9 +3,39 @@ require 'ncc'; import('net.nosial.keybert'); - $keybert = new 
\Keybert\Keybert(null, '127.0.0.1', 2131); + //$keybert = new \Keybert\Keybert(null, 'power.chan.int.n64.cc', 2131); - $document = 'The history of natural language processing (NLP) generally started in the 1950s, although work can be found from earlier periods. In 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.'; - $keywords = $keybert->extractKeywords('paraphrase-multilingual-MiniLM-L12-v2', $document); + $documents = array( + "The quick brown fox jumps over the lazy dog", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit", + "PHP is a popular general-purpose scripting language", + "Machine Learning provides computers the ability to learn without being explicitly programmed", + "OpenAI is an artificial intelligence research lab", + "Blockchain is a decentralized ledger of all transactions across a peer-to-peer network", + "Python is a popular language for data science", + "Artificial Intelligence is a branch of computer science that aims to create intelligent machines", + "Big data is a term that describes the large volume of data – both structured and unstructured – that inundates a business on a day-to-day basis", + "Quantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation" + ); + + $keybert = new \Keybert\Keybert(); + $multi_start = microtime(true); + $keywords = $keybert->extractKeywords('paraphrase-multilingual-MiniLM-L12-v2', $documents); + $multi_end = microtime(true); + + unset($keybert); + $keybert = new \Keybert\Keybert(); + $single_start = microtime(true); + foreach($documents as $document) + { + var_dump($keybert->extractKeywords('paraphrase-multilingual-MiniLM-L12-v2', $document)); + } + $single_end = microtime(true); + + echo "Multi: " . ($multi_end - $multi_start) . PHP_EOL; + echo "Single: " . ($single_end - $single_start) . PHP_EOL; + + // Which is faster? 
Multi or single? + echo "Faster is: " . (($multi_end - $multi_start) < ($single_end - $single_start) ? "Multi" : "Single") . PHP_EOL; var_dump($keywords); \ No newline at end of file