在把二维数组传递给函数时,这种写法具有优势。
I wouldn't call it a VLA. To me, a VLA is:
/*
 * Minimal example of a true variable-length array (VLA): `arr` is an array
 * object with automatic storage whose element count `n` is only known at run
 * time.  The array is declared purely for illustration and is never used.
 */
void
caller(int n)
{
    float arr[n];

    (void) arr;     /* intentionally unused */
}
I'd (loosely) call it a pointer to an array of n floats:
/*
 * Example of the "pointer to an array of n floats" declarator backed by heap
 * storage rather than by a VLA object.
 *
 *   arr    points at one row of n floats
 *   arr2d  points at n rows of n floats, indexable as arr2d[y][x]
 *
 * n must be positive because it is the bound of the pointed-to array type;
 * the function returns immediately otherwise.  Both buffers are released
 * before returning so the example does not leak.
 */
void
caller(int n)
{
    if (n <= 0) {
        return;
    }

    float (*arr)[n] = malloc(sizeof(float) * n);
    float (*arr2d)[n] = malloc(sizeof(float) * n * n);

    /* free(NULL) is a no-op, so a failed allocation needs no special case. */
    free(arr2d);
    free(arr);
}
如果没有这种特殊指针,我们就必须手工计算索引:
/*
 * Fill a w-by-h grid, stored as one flat row-major float buffer, so that
 * every element holds its own linear index.  Because `arr` is a plain float
 * pointer, the 2D -> 1D index has to be computed by hand.
 *
 *   w    number of elements per row
 *   h    number of rows
 *   arr  buffer that is written at indices 0 .. (w * h) - 1
 */
void
process(int w, int h, float *arr)
{
    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            int idx = (row * w) + col;

            arr[idx] = idx;
        }
    }
}
With the special pointer, we pass along dimensions:
/*
 * Fill a w-by-h grid so that every element holds its linear row-major index.
 * `arr` is a pointer to rows of w floats, so the compiler performs the row
 * stride arithmetic and the body can use plain arr[y][x] indexing.
 *
 *   w    number of elements per row (also the bound of the row type)
 *   h    number of rows
 *   arr  pointer to the first of h rows
 */
void
process(int w, int h, float (*arr)[w])
{
    for (int row = 0; row < h; ++row) {
        int base = row * w;     /* linear index of the row's first element */

        for (int col = 0; col < w; ++col) {
            arr[row][col] = base + col;
        }
    }
}
例如,在我最近的一个回答中,我传递的是一个“真正的”二维数组。
注意,我们也可以写出一个类似的函数:
/*
 * Fill a w-by-h grid so that every element holds its linear row-major index.
 * The parameter is spelled as a 2D array, `float arr[h][w]`; as a parameter
 * it is adjusted to a pointer to rows of w floats.
 *
 *   w    number of elements per row
 *   h    number of rows
 *   arr  the grid to fill
 */
void
process(int w, int h, float arr[h][w])
{
    int next = 0;   /* equals (row * w) + col at every store */

    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            arr[row][col] = next;
            ++next;
        }
    }
}
但我认为这种写法有点不伦不类。在我看来,它“看起来像”一个 VLA,但由于它是一个函数参数,它实际上只是在描述调用者传入的指针。
然而,虽然这种语法可以用于函数参数,但我们不能在函数体内部以同样的方式使用它。我们需要使用那种特殊指针:
/*
 * Allocate a w-by-h float grid on the heap, fill every element with its
 * linear row-major index, then release the grid.
 *
 * The storage is reached through a pointer to rows of w floats, which is what
 * allows arr[y][x] indexing on malloc'ed memory.
 *
 * Returns without doing anything when either dimension is not positive (w is
 * the bound of the row type and must be greater than zero) or when the
 * allocation fails.
 */
void
process(int w, int h)
{
    if (w <= 0 || h <= 0) {
        return;
    }

    float (*arr)[w] = malloc(sizeof(float) * w * h);

    if (arr == NULL) {
        return;
    }

    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            arr[y][x] = (y * w) + x;
        }
    }

    free(arr);
}
我们可能不希望向每一个需要操作该数组的函数都传递三个参数。
我们可以创建一个包含宽度、高度和数组指针的 struct。这样,就可以把指针的转换放到函数内部去做:
#include <stdlib.h>
/* A 2D float grid described by its dimensions and a flat element buffer. */
struct arr2d {
    int w;          /* width: number of elements per row */
    int h;          /* height: number of rows */
    float *data;    /* flat buffer; element (x, y) is data[(y * w) + x] */
};
/*
 * Fill the grid so that every element holds its linear row-major index,
 * computing the flat index into arr->data by hand.
 *
 *   arr  grid descriptor; arr->data is written at indices
 *        0 .. (arr->w * arr->h) - 1
 */
void
process1(struct arr2d *arr)
{
    for (int row = 0; row < arr->h; ++row) {
        for (int col = 0; col < arr->w; ++col) {
            int idx = (row * arr->w) + col;

            arr->data[idx] = idx;
        }
    }
}
/*
 * Fill the grid so that every element holds its linear row-major index,
 * viewing the flat buffer as rows of arr->w floats so that grid[y][x]
 * indexing can be used.  The dimensions are read through `arr` throughout.
 *
 *   arr  grid descriptor whose data buffer is filled
 */
void
process2(struct arr2d *arr)
{
    /* Going through void * converts float * to pointer-to-row without a
       spelled-out array cast. */
    float (*grid)[arr->w] = (void *) arr->data;

    for (int row = 0; row < arr->h; ++row) {
        int base = row * arr->w;    /* linear index of the row's first cell */

        for (int col = 0; col < arr->w; ++col) {
            grid[row][col] = base + col;
        }
    }
}
/*
 * Fill the grid so that every element holds its linear row-major index.
 * The dimensions are copied into locals first; the row type and both loop
 * bounds then use those locals instead of re-reading them through `arr`.
 *
 *   arr  grid descriptor whose data buffer is filled
 */
void
process3(struct arr2d *arr)
{
    const int cols = arr->w;
    const int rows = arr->h;
    float (*grid)[cols] = (void *) arr->data;

    for (int row = 0; row < rows; ++row) {
        int base = row * cols;      /* linear index of the row's first cell */

        for (int col = 0; col < cols; ++col) {
            grid[row][col] = base + col;
        }
    }
}
/*
 * Allocate a grid descriptor together with an uninitialized buffer of w * h
 * elements.
 *
 *   w  elements per row; must be positive
 *   h  number of rows; must be positive
 *
 * Returns the new descriptor, or NULL when a dimension is not positive, the
 * byte count would overflow size_t, or either allocation fails.  The caller
 * owns the result and must free both arr->data and the descriptor.
 */
struct arr2d *
new2d(int w, int h)
{
    if (w <= 0 || h <= 0) {
        return NULL;
    }

    struct arr2d *arr = malloc(sizeof(*arr));

    if (arr == NULL) {
        return NULL;
    }

    /* Reject sizes whose byte count does not fit in size_t. */
    size_t maxrows = ((size_t) -1 / sizeof(arr->data[0])) / (size_t) w;

    if ((size_t) h > maxrows) {
        free(arr);
        return NULL;
    }

    arr->w = w;
    arr->h = h;
    arr->data = malloc(sizeof(arr->data[0]) * (size_t) w * (size_t) h);

    if (arr->data == NULL) {
        free(arr);
        return NULL;
    }

    return arr;
}
BENCHMARKS:
我很好奇各个函数和各种索引方式的速度如何。为了尽量减少干扰、只衡量不同索引方式的影响,我把 float 改成了 int,并增加了一个简单的 idx 变量。
下面是基准测试代码:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* A 2D int grid described by its dimensions and a flat element buffer. */
struct arr2d {
    int w;          /* width: number of elements per row */
    int h;          /* height: number of rows */
    int *data;      /* flat buffer; element (x, y) is data[(y * w) + x] */
};
/*
 * Benchmark kernel 1 (the baseline the other kernels are compared against).
 * Stores a running counter into every element, computing the flat index
 * (y * arr->w) + x by hand and reading both dimensions through `arr`.
 *
 * NOTE(review): the exact form of these loops is what gets timed; rewriting
 * them would change what the benchmark measures.
 */
void
process1(struct arr2d *arr)
{
/* running value; advances once per element alongside x */
int idx = 0;
for (int y = 0; y < arr->h; ++y) {
for (int x = 0; x < arr->w; ++x, ++idx)
arr->data[(y * arr->w) + x] = idx;
}
}
/*
 * Benchmark kernel 2.  Stores a running counter into every element, indexing
 * as data[y][x] through a pointer to rows of arr->w ints.  The loop bounds
 * are still read through `arr`.
 *
 * NOTE(review): the exact form of these loops is what gets timed; rewriting
 * them would change what the benchmark measures.
 */
void
process2(struct arr2d *arr)
{
/* view the flat buffer as rows of arr->w ints */
int (*data)[arr->w] = (void *) arr->data;
/* running value; advances once per element alongside x */
int idx = 0;
for (int y = 0; y < arr->h; ++y) {
for (int x = 0; x < arr->w; ++x, ++idx)
data[y][x] = idx;
}
}
/*
 * Benchmark kernel 3.  Same data[y][x] indexing as process2, but the
 * dimensions are first copied into the locals w and h, which are then used
 * for the row type and for both loop bounds.
 *
 * NOTE(review): the exact form of these loops is what gets timed; rewriting
 * them would change what the benchmark measures.
 */
void
process3(struct arr2d *arr)
{
/* local copies of the dimensions */
int w = arr->w;
int h = arr->h;
/* view the flat buffer as rows of w ints */
int (*data)[w] = (void *) arr->data;
/* running value; advances once per element alongside x */
int idx = 0;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x, ++idx)
data[y][x] = idx;
}
}
/*
 * Benchmark kernel 4.  Like process3, but takes a pointer to the current row
 * once per outer iteration and indexes that row with row[x] in the inner
 * loop.
 *
 * NOTE(review): the exact form of these loops is what gets timed; rewriting
 * them would change what the benchmark measures.
 */
void
process4(struct arr2d *arr)
{
/* local copies of the dimensions */
int w = arr->w;
int h = arr->h;
/* view the flat buffer as rows of w ints */
int (*data)[w] = (void *) arr->data;
/* running value; advances once per element alongside x */
int idx = 0;
for (int y = 0; y < h; ++y) {
/* first element of row y */
int *row = data[y];
for (int x = 0; x < w; ++x, ++idx)
row[x] = idx;
}
}
/*
 * Allocate a grid descriptor together with an uninitialized buffer of w * h
 * elements.
 *
 *   w  elements per row; must be positive
 *   h  number of rows; must be positive
 *
 * Returns the new descriptor, or NULL when a dimension is not positive, the
 * byte count would overflow size_t, or either allocation fails.  The caller
 * owns the result and must free both arr->data and the descriptor.
 */
struct arr2d *
new2d(int w, int h)
{
    if (w <= 0 || h <= 0) {
        return NULL;
    }

    struct arr2d *arr = malloc(sizeof(*arr));

    if (arr == NULL) {
        return NULL;
    }

    /* Reject sizes whose byte count does not fit in size_t. */
    size_t maxrows = ((size_t) -1 / sizeof(arr->data[0])) / (size_t) w;

    if ((size_t) h > maxrows) {
        free(arr);
        return NULL;
    }

    arr->w = w;
    arr->h = h;
    arr->data = malloc(sizeof(arr->data[0]) * (size_t) w * (size_t) h);

    if (arr->data == NULL) {
        free(arr);
        return NULL;
    }

    return arr;
}
/*
 * Release a grid: its element buffer first, then the descriptor itself.
 * Passing NULL is a no-op, mirroring free().
 *
 *   arr  descriptor to destroy, or NULL
 */
void
del2d(struct arr2d *arr)
{
    if (arr == NULL) {
        return;
    }

    free(arr->data);
    free(arr);
}
/*
 * Read CLOCK_MONOTONIC and return the reading as fractional seconds
 * (whole seconds plus nanoseconds / 1e9).
 */
double
tscgetf(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);

    return (double) now.tv_sec + ((double) now.tv_nsec / 1e9);
}
/*
 * Run one benchmark kernel through dofnc(), passing the kernel's own name
 * (stringized) as the label.  Expects a variable named `arr` to be in scope
 * at the point of use.
 *
 * The line continuation is required: without it the macro body is empty and
 * the dofnc(...) line becomes a stray statement at file scope.
 */
#define DOFNC(_fnc) \
    dofnc(arr, _fnc, #_fnc)
/*
 * Best time of the first kernel measured since tscold was last reset to 0;
 * used as the baseline for the speedup column printed by dofnc().
 */
double tscold;

/*
 * Time one kernel and print its result.
 *
 * The kernel is run in 5 batches of 100 calls each; the fastest batch time
 * is kept.  If no baseline has been recorded yet (tscold == 0), this
 * kernel's time becomes the baseline.
 *
 *   arr  grid passed to every call of the kernel
 *   fnc  kernel under test
 *   sym  label printed next to the numbers
 *
 * Output: one line "<best seconds> <baseline / best>x <sym>".
 */
void
dofnc(struct arr2d *arr, void (*fnc)(struct arr2d *arr), const char *sym)
{
    /* Start above any plausible measurement so the first batch always wins. */
    double tscbest = 1LL << 60;

    for (int iterbest = 0; iterbest < 5; ++iterbest) {
        double tscbeg = tscgetf();

        for (int iterfnc = 0; iterfnc < 100; ++iterfnc) {
            fnc(arr);
        }

        double tscend = tscgetf();

        tscend -= tscbeg;
        if (tscend < tscbest) {
            tscbest = tscend;
        }
    }

    if (tscold == 0) {
        tscold = tscbest;
    }

    /* The trailing "\n" escape keeps the literal on one source line. */
    printf("%.9f %.3fx %s\n", tscbest, tscold / tscbest, sym);
}
void
dotest(int w,int h)
{
struct arr2d *arr = new2d(w,h);
printf("
");
printf("dotest: w=%d h=%d
",w,h);
tscold = 0;
DOFNC(process1);
DOFNC(process2);
DOFNC(process3);
DOFNC(process4);
del2d(arr);
}
/*
 * Run the benchmark once for each grid shape in the table below, in order.
 */
int
main(void)
{
    /* { width, height } pairs */
    static const int shapes[][2] = {
        { 100, 50 },
        { 50, 100 },
        { 100, 1000 },
        { 1000, 100 },
        { 10000, 10 },
    };
    const size_t count = sizeof(shapes) / sizeof(shapes[0]);

    for (size_t i = 0; i < count; ++i) {
        dotest(shapes[i][0], shapes[i][1]);
    }

    return 0;
}
我在三个优化等级下运行了基准测试:-O0、-O2 和 -O3。各项测试均以 process1 作为基线。结果如下:
--------------------------------------------------------------------------------
optimize -O0
dotest: w=100 h=50
0.003704862 1.000x process1
0.001824568 2.031x process2
0.001825894 2.029x process3
0.001562364 2.371x process4
dotest: w=50 h=100
0.003504016 1.000x process1
0.001778585 1.970x process2
0.001773583 1.976x process3
0.001588978 2.205x process4
dotest: w=100 h=1000
0.074285246 1.000x process1
0.036595154 2.030x process2
0.034042392 2.182x process3
0.032144590 2.311x process4
dotest: w=1000 h=100
0.070179764 1.000x process1
0.033196725 2.114x process2
0.033003560 2.126x process3
0.031031687 2.262x process4
dotest: w=10000 h=10
0.069901074 1.000x process1
0.033057440 2.115x process2
0.032876559 2.126x process3
0.028973187 2.413x process4
--------------------------------------------------------------------------------
optimize -O2
dotest: w=100 h=50
0.000613288 1.000x process1
0.000389683 1.574x process2
0.000377139 1.626x process3
0.000377149 1.626x process4
dotest: w=50 h=100
0.000575807 1.000x process1
0.000363035 1.586x process2
0.000346855 1.660x process3
0.000346888 1.660x process4
dotest: w=100 h=1000
0.011875981 1.000x process1
0.007884906 1.506x process2
0.007599107 1.563x process3
0.007596456 1.563x process4
dotest: w=1000 h=100
0.011400999 1.000x process1
0.007180440 1.588x process2
0.007008333 1.627x process3
0.006975097 1.635x process4
dotest: w=10000 h=10
0.011140902 1.000x process1
0.007144627 1.559x process2
0.006928350 1.608x process3
0.006919723 1.610x process4
--------------------------------------------------------------------------------
optimize -O3
dotest: w=100 h=50
0.000584456 1.000x process1
0.000534780 1.093x process2
0.000094556 6.181x process3
0.000096305 6.069x process4
dotest: w=50 h=100
0.000564249 1.000x process1
0.000377867 1.493x process2
0.000107199 5.264x process3
0.000140041 4.029x process4
dotest: w=100 h=1000
0.011978588 1.000x process1
0.010904583 1.098x process2
0.002855820 4.194x process3
0.002877185 4.163x process4
dotest: w=1000 h=100
0.011205538 1.000x process1
0.010515848 1.066x process2
0.002302689 4.866x process3
0.002076997 5.395x process4
dotest: w=10000 h=10
0.011123261 1.000x process1
0.010506258 1.059x process2
0.001983195 5.609x process3
0.001975420 5.631x process4
预料之中的是:process3 会更快,而 process4 会再略快一些。
意料之外的是:
- The amount of speedup of process3 with -O3.
- For the other optimization levels, it was 1.5x-2.0x faster than process1.
- For -O3, it was 4x-6x faster than process1.
- process4 was slightly slower than process3 in some instances.
查看反汇编后发现,在使用 -O3 时,优化器能够为 process3 和 process4 生成 x86 SIMD 指令。
我想这只是说明,“优化”有时会带来一些令人惊讶的结果。