Before do_analyze_rel is executed, the sampling function (and the relpages value) must be chosen according to the type of relation being analyzed, as follows:
-
If it is a normal table or materialized view, the sampling function uses acquire_sample_rows; relpages is provided by the RelationGetNumberOfBlocks(onerel) function
-
If it is an external table, you need to use the hook provided by the external table, such as postgresAnalyzeForeignTable of postgres_fdw; relpages is also provided by the external table hook of fdwroutine->AnalyzeForeignTable
-
If it is an inherited table, the sampling function uses acquire_inherited_sample_rows
The do_analyze_rel function is mainly divided into three processes: preparation work, sampling work, and update statistical information process.
The preparation workflow of the do_analyze_rel function is as follows:
-
Create anl_context memory context and switch to this memory context
-
Switch to the userid of the table owner, so that any index functions are run as that user.
-
It is necessary to further determine which columns of the table are to be analyzed: the user may specify that only certain columns be analyzed (frequently accessed columns are the most valuable to analyze). Then all indexes of the table are opened to see whether the indexes contain any columns that can be analyzed.
static void do_analyze_rel(Relation onerel, VacuumParams *params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, bool inh, bool in_outer_xact, int elevel) {
int attr_cnt, tcnt, i, ind, nindexes, targrows, numrows, save_sec_context, save_nestlevel;
double totalrows, totaldeadrows;
Relation *Irel;
bool hasindex;
VacAttrStats **vacattrstats;
AnlIndexData *indexdata;
HeapTuple *rows;
PGRUsage ru0;
TimestampTz starttime = 0;
MemoryContext caller_context;
Oid save_userid;
anl_context = AllocSetContextCreate(CurrentMemoryContext,"Analyze",ALLOCSET_DEFAULT_SIZES); /* Set up a working context so that we can easily free whatever junk gets created. */
caller_context = MemoryContextSwitchTo(anl_context);
GetUserIdAndSecContext(&save_userid, &save_sec_context); /* Switch to the table owner's userid, so that any index functions are run as that user. Also lock down security-restricted operations and arrange to make GUC variable changes local to this command. */
SetUserIdAndSecContext(onerel->rd_rel->relowner, save_sec_context | SECURITY_RESTRICTED_OPERATION);
save_nestlevel = NewGUCNestLevel();
if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) { /* measure elapsed time iff autovacuum logging requires it */
pg_rusage_init(&ru0);
if (params->log_min_duration > 0) starttime = GetCurrentTimestamp();
}
/* Determine which columns to analyze Note that system attributes are never analyzed, so we just reject them at the lookup stage. We also reject duplicate column mentions. (We could alternatively ignore duplicates, but analyzing a column twice won't work; we'd end up making a conflicting update in pg_statistic.) */ // 确定要分析的列。请注意,系统属性从未被分析,因此我们只是在查找阶段拒绝它们。我们还拒绝重复提到的列。(我们也可以忽略重复项,但两次分析一列是行不通的;我们最终会在pg_statistic中进行冲突更新。)
if (va_cols != NIL) {
Bitmapset *unique_cols = NULL;
ListCell *le; tcnt = 0;
vacattrstats = (VacAttrStats **) palloc(list_length(va_cols) * sizeof(VacAttrStats *)); // 创建指针数组用于存放VacAttrStats指针
foreach(le, va_cols) {
char *col = strVal(lfirst(le));
i = attnameAttNum(onerel, col, false);
if (i == InvalidAttrNumber) ereport(ERROR,(errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("column \"%s\" of relation \"%s\" does not exist", col, RelationGetRelationName(onerel)))); // 表中不存在这样的列
if (bms_is_member(i, unique_cols)) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_COLUMN), errmsg("column \"%s\" of relation \"%s\" appears more than once", col, RelationGetRelationName(onerel)))); // bitmap set中已经存在这样的列
unique_cols = bms_add_member(unique_cols, i); // 将该列号存放到bitmap set中
vacattrstats[tcnt] = examine_attribute(onerel, i, NULL);
// examine_attribute函数的作用
// 检测该列是否能够支持分析(dropped的列【已删除的字段还存在于系统表中,只是作了标记】、用户指定不需要分析的列等不支持分析)
// 获取数据类型,并决定针对该类型的采样数据量和统计函数不同的类型,其分析函数也不同。如果从pg_type系统表中取出该类型的typanalyze字段不为空,则使用该类型指定的分析函数typanalyze(比如对于数组使用array_typanalyze);否则使用标准分析函数std_typanalyze
if (vacattrstats[tcnt] != NULL) tcnt++;
}
attr_cnt = tcnt;
} else { // 全部所有的列
attr_cnt = onerel->rd_att->natts; tcnt = 0;
vacattrstats = (VacAttrStats **)palloc(attr_cnt * sizeof(VacAttrStats *));
for (i = 1; i <= attr_cnt; i++) {
vacattrstats[tcnt] = examine_attribute(onerel, i, NULL);
if (vacattrstats[tcnt] != NULL) tcnt++;
}
attr_cnt = tcnt;
}
/* Open all indexes of the relation, and see if there are any analyzable columns in the indexes. We do not analyze index columns if there was an explicit column list in the ANALYZE command, however. If we are doing a recursive scan, we don't want to touch the parent's indexes at all. */
if (!inh) vac_open_indexes(onerel, AccessShareLock, &nindexes, &Irel);
else{ // 如果我们做递归,不要去打开父表的索引
Irel = NULL;
nindexes = 0;
}
hasindex = (nindexes > 0); indexdata = NULL;
if (hasindex) { // 有索引
indexdata = (AnlIndexData *) palloc0(nindexes * sizeof(AnlIndexData));
for (ind = 0; ind < nindexes; ind++){ // 遍历每个索引
AnlIndexData *thisdata = &indexdata[ind];
IndexInfo *indexInfo;
thisdata->indexInfo = indexInfo = BuildIndexInfo(Irel[ind]);
thisdata->tupleFract = 1.0; /* fix later if partial */
if (indexInfo->ii_Expressions != NIL && va_cols == NIL){
ListCell *indexpr_item = list_head(indexInfo->ii_Expressions);
thisdata->vacattrstats = (VacAttrStats **)palloc(indexInfo->ii_NumIndexAttrs * sizeof(VacAttrStats *));
tcnt = 0;
for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++){
int keycol = indexInfo->ii_IndexAttrNumbers[i];
if (keycol == 0){
Node *indexkey; /* Found an index expression */
if (indexpr_item == NULL) /* shouldn't happen */ elog(ERROR, "too few entries in indexprs list");
indexkey = (Node *) lfirst(indexpr_item);
indexpr_item = lnext(indexpr_item);
thisdata->vacattrstats[tcnt] = examine_attribute(Irel[ind], i + 1, indexkey);
if (thisdata->vacattrstats[tcnt] != NULL) tcnt++;
}
}
thisdata->attr_cnt = tcnt;
}
}
}
The sampling workflow of the do_analyze_rel function is as follows: use the worst case among all analyzable columns to determine the number of rows that need to be sampled; use the sampling function to obtain sampled rows; calculate statistics.
/* Determine how many rows we need to sample, using the worst case from all analyzable columns. We use a lower bound of 100 rows to avoid possible overflow in Vitter's algorithm. (Note: that will also be the target in the corner case where there are no analyzable columns.) */ // 使用所有可分析列中的最坏情况,确定需要采样的行数。我们使用100行的下限来避免Vitter算法中可能的溢出。
targrows = 100;
for (i = 0; i < attr_cnt; i++) { // 表列
if (targrows < vacattrstats[i]->minrows) targrows = vacattrstats[i]->minrows; // 100行的下限
}
for (ind = 0; ind < nindexes; ind++) { // 索引列
AnlIndexData *thisdata = &indexdata[ind];
for (i = 0; i < thisdata->attr_cnt; i++){
if (targrows < thisdata->vacattrstats[i]->minrows) targrows = thisdata->vacattrstats[i]->minrows; // 100行的下限
}
}
/* Acquire the sample rows */
rows = (HeapTuple *) palloc(targrows * sizeof(HeapTuple)); // 分配目标行*sizeof(HeapTuple)内存空间
if (inh) numrows = acquire_inherited_sample_rows(onerel, elevel, rows, targrows, &totalrows, &totaldeadrows); // 继承表,进入acquire_inherited_sample_rows函数会递归整个分析流程
else numrows = (*acquirefunc) (onerel, elevel, rows, targrows, &totalrows, &totaldeadrows);
/* Compute the statistics. Temporary results during the calculations for each column are stored in a child context. The calc routines are responsible to make sure that whatever they store into the VacAttrStats structure is allocated in anl_context. */ // 计算统计数据。每列计算期间的临时结果存储在子上下文中。calc例程负责确保它们存储到VacationrStats结构中的任何内容都在anl_context中分配。
if (numrows > 0){
MemoryContext col_context,old_context;
col_context = AllocSetContextCreate(anl_context,"Analyze Column",ALLOCSET_DEFAULT_SIZES);
old_context = MemoryContextSwitchTo(col_context);
for (i = 0; i < attr_cnt; i++){ // 对表的列
VacAttrStats *stats = vacattrstats[i];
AttributeOpts *aopt;
stats->rows = rows;
stats->tupDesc = onerel->rd_att;
stats->compute_stats(stats, std_fetch_func, numrows, totalrows);
/* If the appropriate flavor of the n_distinct option is specified, override with the corresponding value. */
aopt = get_attribute_options(onerel->rd_id, stats->attr->attnum);
if (aopt != NULL){
float8 n_distinct;
n_distinct = inh ? aopt->n_distinct_inherited : aopt->n_distinct;
if (n_distinct != 0.0) stats->stadistinct = n_distinct;
}
MemoryContextResetAndDeleteChildren(col_context);
}
if (hasindex) compute_index_stats(onerel, totalrows,indexdata, nindexes,rows, numrows,col_context);
MemoryContextSwitchTo(old_context);
MemoryContextDelete(col_context);
The statistics-update flow of the do_analyze_rel function is as follows: emit the completed statistics rows into pg_statistic, replacing any previous statistics for the target columns; build extended statistics; update the page/tuple statistics in pg_class; and report the ANALYZE to the statistics collector.
/* Emit the completed stats rows into pg_statistic, replacing any previous statistics for the target columns. (If there are stats in pg_statistic for columns we didn't process, we leave them alone.) */ // 将完成的统计数据行发送到pg_statistic,替换目标列的任何以前的统计数据。(如果pg_statistic中有我们没有处理的列的统计信息,我们就不处理它们。)
update_attstats(RelationGetRelid(onerel), inh, attr_cnt, vacattrstats);
for (ind = 0; ind < nindexes; ind++){
AnlIndexData *thisdata = &indexdata[ind];
update_attstats(RelationGetRelid(Irel[ind]), false, thisdata->attr_cnt, thisdata->vacattrstats);
}
/* Build extended statistics (if there are any). For now we only build extended statistics on individual relations, not for relations representing inheritance trees. */ // 构建扩展统计数据(如果有)。目前,我们只对单个关系构建扩展统计数据,而不是表示继承树的关系
if (!inh) BuildRelationExtStatistics(onerel, totalrows, numrows, rows, attr_cnt, vacattrstats);
}
/* Update pages/tuples stats in pg_class ... but not if we're doing inherited stats. */ // 更新pg_class中的页面/元组统计
if (!inh) {
BlockNumber relallvisible;
visibilitymap_count(onerel, &relallvisible, NULL);
vac_update_relstats(onerel,relpages,totalrows,relallvisible,hasindex,InvalidTransactionId,InvalidMultiXactId,in_outer_xact);
}
/* Same for indexes. Vacuum always scans all indexes, so if we're part of VACUUM ANALYZE, don't overwrite the accurate count already inserted by VACUUM. */
if (!inh && !(params->options & VACOPT_VACUUM)){
for (ind = 0; ind < nindexes; ind++){
AnlIndexData *thisdata = &indexdata[ind];
double totalindexrows;
totalindexrows = ceil(thisdata->tupleFract * totalrows);
vac_update_relstats(Irel[ind],RelationGetNumberOfBlocks(Irel[ind]),totalindexrows,0,false,InvalidTransactionId,InvalidMultiXactId,in_outer_xact);
}
}
/* Report ANALYZE to the stats collector, too. However, if doing inherited stats we shouldn't report, because the stats collector only tracks per-table stats. Reset the changes_since_analyze counter only if we analyzed all columns; otherwise, there is still work for auto-analyze to do. */ // 也向统计收集器报告分析。但是,如果执行继承的统计,我们不应该报告,因为统计收集器只跟踪每个表的统计。仅当我们分析了所有列时,重置changes_since_analyze计数器;否则,自动分析仍有工作要做
if (!inh) pgstat_report_analyze(onerel, totalrows, totaldeadrows, (va_cols == NIL));