Skip to content

Commit 143b3df

Browse files
authored
Add GUC 'gp_random_insert_segments' to control the segments used for random distributed table insertion (#406)
Introduces the 'gp_random_insert_segments' GUC to reduce the number of small, fragmented files generated when a small amount of data is inserted into a cluster with a large number of segments (e.g., 1000 records into 100 segments). Fragmented data insertion can significantly degrade performance, especially when using append-optimized or cloud-based storage. With this GUC, users can limit the number of segments used for data insertion into randomly distributed tables, which can significantly reduce the number of fragmented files.
1 parent d189fd0 commit 143b3df

7 files changed

Lines changed: 49 additions & 0 deletions

File tree

src/backend/cdb/cdbllize.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1332,6 +1332,20 @@ build_slice_table_walker(Node *node, build_slice_table_context *context)
13321332
sendSlice->directDispatch.contentIds = list_make1_int(0);
13331333
}
13341334

1335+
if (root->parse->commandType == CMD_INSERT &&
1336+
motion->motionType == MOTIONTYPE_HASH &&
1337+
motion->plan.locustype == CdbLocusType_Strewn &&
1338+
motion->numHashSegments == gp_random_insert_segments)
1339+
{
1340+
PlanSlice *recvSlice;
1341+
/*
1342+
* Using limited segments for random distributed data insertion, we
1343+
* just enable limited segments to do actual works.
1344+
*/
1345+
recvSlice = (PlanSlice *) list_nth(context->slices, sendSlice->parentIndex);
1346+
recvSlice->numsegments = motion->numHashSegments;
1347+
}
1348+
13351349
result = plan_tree_walker((Node *) motion,
13361350
build_slice_table_walker,
13371351
context,

src/backend/cdb/cdbpath.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2611,6 +2611,15 @@ create_motion_path_for_insert(PlannerInfo *root, GpPolicy *policy,
26112611
}
26122612
else
26132613
elog(ERROR, "unrecognized policy type %u", policyType);
2614+
2615+
if (CdbPathLocus_IsStrewn(subpath->locus) && subpath->locus.distkey == NIL &&
2616+
gp_random_insert_segments > 0 &&
2617+
gp_random_insert_segments < CdbPathLocus_NumSegments(subpath->locus))
2618+
{
2619+
/* Select limited random segments for data insertion. */
2620+
subpath->locus.numsegments = gp_random_insert_segments;
2621+
}
2622+
26142623
return subpath;
26152624
}
26162625

src/backend/commands/copyfrom.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3324,6 +3324,12 @@ GetTargetSeg(GpDistributionData *distData, TupleTableSlot *slot)
33243324

33253325
target_seg = cdbhashreduce(cdbHash); /* hash result segment */
33263326
}
3327+
else if (gp_random_insert_segments > 0 &&
3328+
gp_random_insert_segments < policy->numsegments)
3329+
{
3330+
/* Select limited random segments for data insertion. */
3331+
target_seg = cdbhashrandomseg(gp_random_insert_segments);
3332+
}
33273333
else
33283334
{
33293335
/*

src/backend/gpopt/translate/CTranslatorQueryToDXL.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ extern "C" {
2424
#include "nodes/parsenodes.h"
2525
#include "nodes/plannodes.h"
2626
#include "optimizer/walkers.h"
27+
#include "utils/guc.h"
2728
#include "utils/rel.h"
2829
}
2930

@@ -736,6 +737,12 @@ CTranslatorQueryToDXL::TranslateInsertQueryToDXL()
736737
GPOS_WSZ_LIT("DML not enabled"));
737738
}
738739

740+
if (gp_random_insert_segments > 0)
741+
{
742+
GPOS_RAISE(gpdxl::ExmaDXL, gpdxl::ExmiQuery2DXLUnsupportedFeature,
743+
GPOS_WSZ_LIT("limited insert segments not supported"));
744+
}
745+
739746
CDXLNode *query_dxlnode = TranslateSelectQueryToDXL();
740747
const RangeTblEntry *rte = (RangeTblEntry *) gpdb::ListNth(
741748
m_query->rtable, m_query->resultRelation - 1);

src/backend/utils/misc/guc_gp.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ int gp_appendonly_compaction_threshold = 0;
146146
bool enable_parallel = false;
147147
int gp_appendonly_insert_files = 0;
148148
int gp_appendonly_insert_files_tuples_range = 0;
149+
int gp_random_insert_segments = 0;
149150
bool gp_heap_require_relhasoids_match = true;
150151
bool gp_local_distributed_cache_stats = false;
151152
bool debug_xlog_record_read = false;
@@ -3215,6 +3216,16 @@ struct config_int ConfigureNamesInt_gp[] =
32153216
NULL, NULL, NULL
32163217
},
32173218

3219+
{
3220+
{"gp_random_insert_segments", PGC_USERSET, CUSTOM_OPTIONS,
3221+
gettext_noop("Use limited number of segments for random distributed table insertion."),
3222+
NULL
3223+
},
3224+
&gp_random_insert_segments,
3225+
0, 0, INT_MAX,
3226+
NULL, NULL, NULL
3227+
},
3228+
32183229
{
32193230
{"gp_workfile_max_entries", PGC_POSTMASTER, RESOURCES,
32203231
gettext_noop("Sets the maximum number of entries that can be stored in the workfile directory"),

src/include/utils/guc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ extern bool gp_appendonly_compaction;
300300
extern bool enable_parallel;
301301
extern int gp_appendonly_insert_files;
302302
extern int gp_appendonly_insert_files_tuples_range;
303+
extern int gp_random_insert_segments;
303304
extern bool enable_answer_query_using_materialized_views;
304305
extern bool enable_offload_entry_to_qe;
305306
/*

src/include/utils/sync_guc_name.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,3 +152,4 @@
152152
"gp_resgroup_debug_wait_queue",
153153
"gp_appendonly_insert_files",
154154
"gp_appendonly_insert_files_tuples_range",
155+
"gp_random_insert_segments",

0 commit comments

Comments
 (0)