Skip to content

Commit d79b7b6

Browse files
authored
Merge branch 'master' into fulltext-search-with-simple-query-string
2 parents 37aab00 + 307b252 commit d79b7b6

36 files changed

+634
-306
lines changed

.travis.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ addons:
99
- openjdk-8-jre-headless
1010
matrix:
1111
include:
12-
- php: 7.2
12+
- php: 7.3
1313
env: ES=6
14-
- php: 7.2
14+
- php: 7.3
1515
env: ES=7
1616
- php: 7.4
1717
env: ES=6
@@ -23,7 +23,7 @@ cache:
2323
- $HOME/.composer/cache
2424

2525
before_install:
26-
- export NEOS_TARGET_VERSION=5.3
26+
- export NEOS_TARGET_VERSION=7.0
2727
- cd ..
2828
- if [ "$ES" = 6 ]; then wget --no-check-certificate https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.8.6.tar.gz && tar xvfz elasticsearch-6.8.6.tar.gz && mv elasticsearch-6.8.6 elasticsearch; fi
2929
- if [ "$ES" = 7 ]; then wget --no-check-certificate https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.0-linux-x86_64.tar.gz && tar xvfz elasticsearch-7.9.0-linux-x86_64.tar.gz && mv elasticsearch-7.9.0 elasticsearch; fi
@@ -35,7 +35,6 @@ before_install:
3535
- composer require --no-update --no-interaction neos/content-repository-search:dev-master
3636
- composer require --no-update --no-interaction flowpack/elasticsearch:dev-master
3737
- composer require --no-update --no-interaction flowpack/elasticsearch-contentrepositoryadaptor:dev-master
38-
- composer remove --no-update --no-interaction neos/site-kickstarter
3938
install:
4039
- composer install --no-interaction
4140
- cd ..

Classes/AssetExtraction/IngestAttachmentAssetExtractor.php

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@
1313
* source code.
1414
*/
1515

16-
use Neos\Flow\Annotations as FLow;
16+
use Neos\Flow\Annotations as Flow;
1717
use Neos\ContentRepository\Search\AssetExtraction\AssetExtractorInterface;
1818
use Neos\ContentRepository\Search\Dto\AssetContent;
1919
use Flowpack\ElasticSearch\ContentRepositoryAdaptor\ElasticSearchClient;
20+
use Neos\Flow\Log\ThrowableStorageInterface;
2021
use Neos\Flow\Log\Utility\LogEnvironment;
2122
use Neos\Media\Domain\Model\AssetInterface;
2223
use Neos\Utility\Arrays;
@@ -33,12 +34,24 @@ class IngestAttachmentAssetExtractor implements AssetExtractorInterface
3334
*/
3435
protected $elasticsearchClient;
3536

37+
/**
38+
* @Flow\Inject
39+
* @var ThrowableStorageInterface
40+
*/
41+
protected $throwableStorage;
42+
3643
/**
3744
* @Flow\Inject
3845
* @var LoggerInterface
3946
*/
4047
protected $logger;
4148

49+
/**
50+
* @Flow\InjectConfiguration(package="Flowpack.ElasticSearch.ContentRepositoryAdaptor", path="indexing.assetExtraction.maximumFileSize")
51+
* @var int
52+
*/
53+
protected $maximumFileSize;
54+
4255
/**
4356
* Takes an asset and extracts content and meta data.
4457
*
@@ -50,6 +63,13 @@ class IngestAttachmentAssetExtractor implements AssetExtractorInterface
5063
*/
5164
public function extract(AssetInterface $asset): AssetContent
5265
{
66+
if ($asset->getResource()->getFileSize() > $this->maximumFileSize) {
67+
$this->logger->info(sprintf('The asset %s with size of %s bytes exceeds the maximum size of %s bytes. The file content was not ingested.', $asset->getResource()->getFilename(), $asset->getResource()->getFileSize(), $this->maximumFileSize), LogEnvironment::fromMethodName(__METHOD__));
68+
return $this->buildAssetContentObject([]);
69+
}
70+
71+
$extractedAsset = null;
72+
5373
$request = [
5474
'pipeline' => [
5575
'description' => 'Attachment Extraction',
@@ -73,14 +93,50 @@ public function extract(AssetInterface $asset): AssetContent
7393
];
7494

7595
$result = $this->elasticsearchClient->request('POST', '_ingest/pipeline/_simulate', [], json_encode($request))->getTreatedContent();
76-
$extractedAsset = Arrays::getValueByPath($result, 'docs.0.doc._source.attachment');
96+
97+
if (is_array($result)) {
98+
$extractedAsset = Arrays::getValueByPath($result, 'docs.0.doc._source.attachment');
99+
}
77100

78101
if (!is_array($extractedAsset)) {
79-
$this->logger->error(sprintf('Error while extracting fulltext data from file "%s". See Elasticsearch error log line fo details.', $asset->getResource()->getFilename()), LogEnvironment::fromMethodName(__METHOD__));
102+
$this->logger->error(sprintf('Error while extracting fulltext data from file "%s". See Elasticsearch error log line for details.', $asset->getResource()->getFilename()), LogEnvironment::fromMethodName(__METHOD__));
80103
} else {
81104
$this->logger->debug(sprintf('Extracted asset %s of type %s. Extracted %s characters of content', $asset->getResource()->getFilename(), $extractedAsset['content_type'] ?? '-no-content-type-', $extractedAsset['content_length'] ?? '0'), LogEnvironment::fromMethodName(__METHOD__));
82105
}
83106

107+
return $this->buildAssetContentObject($extractedAsset);
108+
}
109+
110+
/**
 * Reads the binary content of the asset's resource and returns it base64-encoded.
 *
 * Failures are logged and reported to the caller as an empty string instead of
 * being thrown, so a single unreadable asset does not abort the extraction.
 *
 * @param AssetInterface $asset The asset whose resource content should be read
 * @return string The base64-encoded content, or an empty string if the stream
 *                could not be opened or read
 */
protected function getAssetContent(AssetInterface $asset): string
{
    $resource = $asset->getResource();

    try {
        $stream = $resource->getStream();
    } catch (\Exception $e) {
        // Persist the full throwable and reference it in the regular log entry.
        $message = $this->throwableStorage->logThrowable($e);
        $this->logger->error(sprintf('An exception occured while fetching resource with sha1 %s of asset %s. %s', $resource->getSha1(), $resource->getFilename(), $message), LogEnvironment::fromMethodName(__METHOD__));
        return '';
    }

    // Anything that is not a usable stream handle (not only boolean false) must
    // not be handed to the stream functions below.
    if (!is_resource($stream)) {
        $this->logger->error(sprintf('Could not get the file stream of resource with sha1 %s of asset %s.', $resource->getSha1(), $resource->getFilename()), LogEnvironment::fromMethodName(__METHOD__));
        return '';
    }

    try {
        // Encode on the fly while reading instead of encoding the raw content afterwards.
        stream_filter_append($stream, 'convert.base64-encode');
        $result = stream_get_contents($stream);
    } finally {
        // Release the handle; otherwise one stream per asset stays open for the
        // rest of the indexing run.
        fclose($stream);
    }

    return $result !== false ? $result : '';
}
133+
134+
/**
135+
* @param array|null $extractedAsset
136+
* @return AssetContent
137+
*/
138+
protected function buildAssetContentObject(?array $extractedAsset): AssetContent
139+
{
84140
return new AssetContent(
85141
$extractedAsset['content'] ?? '',
86142
$extractedAsset['title'] ?? '',
@@ -93,16 +149,4 @@ public function extract(AssetInterface $asset): AssetContent
93149
$extractedAsset['language'] ?? ''
94150
);
95151
}
96-
97-
/**
98-
* @param AssetInterface $asset
99-
* @return null|string
100-
*/
101-
protected function getAssetContent(AssetInterface $asset): ?string
102-
{
103-
$stream = $asset->getResource()->getStream();
104-
stream_filter_append($stream, 'convert.base64-encode');
105-
$result = stream_get_contents($stream);
106-
return $result !== false ? $result : null;
107-
}
108152
}

0 commit comments

Comments
 (0)