Skip to content

Commit 1de81b1

Browse files
committed
FEATURE: Define max file size for ingesting attachments
Huge files may exceed the available memory and should thus not be ingested.
1 parent 44c2fea commit 1de81b1

2 files changed

Lines changed: 35 additions & 12 deletions

File tree

Classes/AssetExtraction/IngestAttachmentAssetExtractor.php

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ class IngestAttachmentAssetExtractor implements AssetExtractorInterface
4646
*/
4747
protected $logger;
4848

49+
/**
50+
* @Flow\InjectConfiguration(package="Flowpack.ElasticSearch.ContentRepositoryAdaptor", path="indexing.assetExtraction.maximumFileSize")
51+
* @var int
52+
*/
53+
protected $maximumFileSize;
54+
4955
/**
5056
* Takes an asset and extracts content and meta data.
5157
*
@@ -57,6 +63,11 @@ class IngestAttachmentAssetExtractor implements AssetExtractorInterface
5763
*/
5864
public function extract(AssetInterface $asset): AssetContent
5965
{
66+
if ($asset->getResource()->getFileSize() > $this->maximumFileSize) {
67+
$this->logger->info(sprintf('The asset %s with size of %s bytes exceeds the maximum size of %s bytes. The file content was not ingested.', $asset->getResource()->getFilename(), $asset->getResource()->getFileSize(), $this->maximumFileSize), LogEnvironment::fromMethodName(__METHOD__));
68+
return $this->buildAssetContentObject([]);
69+
}
70+
6071
$extractedAsset = null;
6172

6273
$request = [
@@ -88,22 +99,12 @@ public function extract(AssetInterface $asset): AssetContent
8899
}
89100

90101
if (!is_array($extractedAsset)) {
91-
$this->logger->error(sprintf('Error while extracting fulltext data from file "%s". See Elasticsearch error log line fo details.', $asset->getResource()->getFilename()), LogEnvironment::fromMethodName(__METHOD__));
102+
$this->logger->error(sprintf('Error while extracting fulltext data from file "%s". See Elasticsearch error log line for details.', $asset->getResource()->getFilename()), LogEnvironment::fromMethodName(__METHOD__));
92103
} else {
93104
$this->logger->debug(sprintf('Extracted asset %s of type %s. Extracted %s characters of content', $asset->getResource()->getFilename(), $extractedAsset['content_type'] ?? '-no-content-type-', $extractedAsset['content_length'] ?? '0'), LogEnvironment::fromMethodName(__METHOD__));
94105
}
95106

96-
return new AssetContent(
97-
$extractedAsset['content'] ?? '',
98-
$extractedAsset['title'] ?? '',
99-
$extractedAsset['name'] ?? '',
100-
$extractedAsset['author'] ?? '',
101-
$extractedAsset['keywords'] ?? '',
102-
$extractedAsset['date'] ?? '',
103-
$extractedAsset['content_type'] ?? '',
104-
$extractedAsset['content_length'] ?? 0,
105-
$extractedAsset['language'] ?? ''
106-
);
107+
return $this->buildAssetContentObject($extractedAsset);
107108
}
108109

109110
/**
@@ -129,4 +130,23 @@ protected function getAssetContent(AssetInterface $asset): string
129130
$result = stream_get_contents($stream);
130131
return $result !== false ? $result : '';
131132
}
133+
134+
/**
135+
* Builds an AssetContent instance from an extraction result array.
*
* Every key that is missing from the array (or null) falls back to a default:
* an empty string for all fields except "content_length", which falls back to 0.
* Passing an empty array therefore yields an AssetContent made up of defaults only.
*
* @param array $extractedAsset Extraction result; the keys read are content, title, name, author, keywords, date, content_type, content_length and language
136+
* @return AssetContent
137+
*/
138+
protected function buildAssetContentObject(array $extractedAsset): AssetContent
139+
{
140+
return new AssetContent(
141+
$extractedAsset['content'] ?? '',
142+
$extractedAsset['title'] ?? '',
143+
$extractedAsset['name'] ?? '',
144+
$extractedAsset['author'] ?? '',
145+
$extractedAsset['keywords'] ?? '',
146+
$extractedAsset['date'] ?? '',
147+
$extractedAsset['content_type'] ?? '',
148+
$extractedAsset['content_length'] ?? 0,
149+
$extractedAsset['language'] ?? ''
150+
);
151+
}
132152
}

Configuration/Settings.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ Flowpack:
77
batchSize:
88
elements: 500
99
octets: 40000000
10+
assetExtraction:
11+
# The maximum size of files to be ingested, in bytes (100 MiB)
12+
maximumFileSize: 104857600
1013
configuration:
1114
nodeTypes:
1215
'*':

0 commit comments

Comments
 (0)