<code>
代码原型，不是我写的，只是拿来 copy 的。
/*
 * Table-driven remap of a rectangle of the destination image, blending a
 * 2x2 neighbourhood of source pixels with fixed-point weights.
 *
 * src       source image, rows of Width * nchanner bytes
 * Width     source width in pixels (used only for the source row stride)
 * Height    not used by this routine
 * table1    reinterpreted as an int array with one entry per destination
 *           pixel: source row in bits 31..16, source column in bits 15..0
 * table2    reinterpreted as an int array with two entries per destination
 *           pixel, each holding two 16-bit weights
 * dstWidth  destination width in pixels (row stride of both tables and dst)
 * dstHeight not used by this routine
 * nchanner  bytes per source pixel used when computing the source offset
 * dst       destination image, always written as 3 bytes per pixel
 * stRect    destination rectangle; rows [top, bottom), columns [left, right)
 *
 * Always returns 1.
 *
 * NOTE(review): the neighbouring pixel is read at a fixed +3 bytes and the
 * output is 3 bytes per pixel, so this presumably only works for
 * nchanner == 3 - confirm with callers.
 * NOTE(review): no bounds checks are made; the routine reads six bytes from
 * the addressed source row and six from the row after it.
 */
static LA_bool xxx(unsigned char *src, int Width, int Height, short *table1, short *table2,
                   int dstWidth, int dstHeight, int nchanner, unsigned char *dst, Rect_S stRect)
{
    const int *coord_lut = (const int *)table1;
    const int *weight_lut = (const int *)table2;
    const int src_stride = Width * nchanner;
    const int weight_stride = dstWidth * 2;
    int row, col, ch;

    for (row = stRect.top; row < stRect.bottom; ++row)
    {
        const int weight_row = row * weight_stride;
        unsigned char *out = dst + row * dstWidth * 3 + stRect.left * 3;

        for (col = stRect.left; col < stRect.right; ++col)
        {
            /* Packed source coordinate for this destination pixel. */
            const int packed_xy = coord_lut[row * dstWidth + col];
            const int src_y = (packed_xy >> 16) & 0x0000ffff;
            const int src_x = packed_xy & 0x0000ffff;

            /* Two packed ints carry the four blend weights. */
            const int weight_idx = weight_row + (col << 1);
            const int packed_w0 = weight_lut[weight_idx];
            const int packed_w1 = weight_lut[weight_idx + 1];
            const short w_cur = (short)(packed_w0 & 0x0000ffff);                /* this row, this pixel */
            const short w_below = (short)((packed_w0 >> 16) & 0x0000ffff);      /* next row, this pixel */
            const short w_next = (short)(packed_w1 & 0x0000ffff);               /* this row, next pixel */
            const short w_below_next = (short)((packed_w1 >> 16) & 0x0000ffff); /* next row, next pixel */

            const int off_row0 = src_y * src_stride + src_x * nchanner;
            const int off_row1 = off_row0 + src_stride;
            const unsigned char *row0 = src + off_row0;
            const unsigned char *row1 = src + off_row1;
            int acc[3];

            /* Gather and blend all three channels before storing anything. */
            for (ch = 0; ch < 3; ++ch)
            {
                acc[ch] = row0[ch] * w_cur + row1[ch] * w_below
                        + row0[ch + 3] * w_next + row1[ch + 3] * w_below_next;
            }

            for (ch = 0; ch < 3; ++ch)
            {
                *out++ = (unsigned char)(acc[ch] >> BITOFF);
            }
        }
    }

    return 1;
}
</code>
==== 优化后代码 ====
<code>
/* Source image width; ImgIn_w * PixelSize is the byte distance Load_Dot16_YUV422 uses to reach the next source row. */
static int ImgIn_w = 800;
/* Source image height; not referenced by the code visible in this section. */
static int ImgIn_h = 600;
/* Row length (in entries/pixels) of the lookup tables and of the output image. */
static int Img_w = 600;
/* Row count of the tables/output; Img_h * Img_w is the size of one layer when skipping by image index. */
static int Img_h = 800;
/* Bytes per source pixel, used together with ImgIn_w for the source row stride. */
static int PixelSize = 2;
/*
 * Load_Dot16_YUV422 - hand-pipelined, table-driven remap of a rectangle of
 * 2-byte pixels, producing two output pixels (one 32-bit word) per inner
 * iteration.
 *
 * stRect_left/right  column range of the rectangle; (right - left) >> 1 pixel
 *                    pairs are stored per row, so an odd width drops the last
 *                    column
 * stRect_top/bottom  row range [top, bottom)
 * ImageIndex         number of whole Img_h * Img_w layers to skip in the XY
 *                    and coefficient tables
 * ImageOutIndex      number of whole Img_h * Img_w layers to skip in the output
 *
 * Uses names defined outside this block: pImgS1In, pImgS1Out, pParam_Loadx4XY,
 * pParam_Dot16Cof, BITOFF, tu32 and pDtmp.
 *
 * The _xxx() calls look like TI C6000/C66x compiler intrinsics; their exact
 * semantics are not visible here, so the per-line notes on them are hedged.
 *
 * NOTE(review): pDtmp is assigned and dereferenced below but is not declared
 * in this function - it must exist at file scope with a 32-bit element type
 * for the store to emit one word per pixel pair; confirm.
 * NOTE(review): addresses are kept in unsigned int and assigned to pointers
 * without casts, which relies on 32-bit addresses and a permissive compiler.
 * NOTE(review): per row the XY table is read i_LoopNum + 3 times and source
 * loads are issued for i_LoopNum + 2 entries, while only i_LoopNum results
 * are stored; the tables need valid entries past the rectangle's right edge.
 */
static void Load_Dot16_YUV422(int stRect_left, int stRect_right, int stRect_top, int stRect_bottom, int ImageIndex, int ImageOutIndex)
{
// Base addresses held as integers so the offsets below are plain byte arithmetic.
unsigned int src0 = pImgS1In;
unsigned int dst0 = pImgS1Out;
unsigned int xy_tab0 = pParam_Loadx4XY;
unsigned int cof_tab0 = pParam_Dot16Cof;
int Index_i, Index_j;
unsigned char* src;
unsigned char * src1;
register tu32 XY_tab_Addr;
unsigned long long *pXY_tab;
int i_LoopNum;
int j_offset;
// One 64-bit XY-table entry: two 32-bit source byte offsets (pixel 1 and pixel 2).
unsigned long long offset_12;
int offset_load_1;
int offset_load_1_is;
unsigned long long data_load_A_1;
unsigned long long data_load_B_1;
int offset_load_2;
int offset_load_2_is;
unsigned long long data_load_A_2;
unsigned long long data_load_B_2;
unsigned long long *pCof_tab;
unsigned long long *pCof_tab1;
unsigned long long Cof_ABCD_1;
unsigned long long Cof_ABCD_2;
// Mask that keeps the low byte of every 16-bit lane.
unsigned long long Data64_AND_00FF = 0x00FF00FF00FF00FF;
unsigned int Data32_7654_A_1;
unsigned int Data32_7654_B_1;
unsigned int Data32_3210_A_2;
unsigned int Data32_3210_B_2;
unsigned long long Data64_DP2_76765454_1;
unsigned long long Data64_DP2_32321010_2;
unsigned long long Data64_DPH4_75753131_1;
unsigned long long Data64_DPH4_75753131_2;
unsigned long long Data64_MV55_75753131_1;
unsigned long long Data64_MV33_75753131_2;
unsigned long long Data64_SHFU_75753131_1;
unsigned long long Data64_SHFU_75753131_2;
unsigned int Data32_7575_1;
unsigned int Data32_3131_1;
unsigned int Data32_7575_2;
unsigned int Data32_3131_2;
int Shift_Num_1;
int Shift_Num_2;
// Presumably the 2x2 neighbourhood naming used in the identifiers below:
// A B on the first source row, C D on the next one - confirm.
//AB
//CD
unsigned long long Data64_Y0_0B0D0A0C;
unsigned long long Data64_Y1_0B0D0A0C;
unsigned long long Data64_U0_0B0D0A0C;
unsigned long long Data64_V1_0B0D0A0C;
__x128_t D128_Y1Y0_0B0D0A0C;
__x128_t D128_V1U0_0B0D0A0C;
__x128_t D128_C1C0_0B0D0A0C;
// Dot-product results; produced in one iteration and consumed in the next.
unsigned long long D64_Y1Y0_Dot16;
unsigned long long D64_V1U0_Dot16;
unsigned long long D64_Y1Y0_SHRU;
unsigned long long D64_V1U0_SHRU;
unsigned long long D64_V1_Y1_U0_Y0;
unsigned int D32_V1_Y1;
unsigned int D32_U0_Y0;
unsigned int D32_V1Y1U0Y0;
// src addresses the source row selected by a table offset, src1 the row after it.
src = src0;
src1 = src0 + ImgIn_w * PixelSize;
// Layer offset: skip whole layers of Img_h * Img_w entries
// (4 bytes per XY entry, 8 bytes per coefficient entry, 2 bytes per output pixel).
xy_tab0+=Img_h*Img_w*4*ImageIndex;
cof_tab0+=Img_h*Img_w*(1<<3)*ImageIndex;
dst0 +=Img_h*Img_w*2*ImageOutIndex;
// Window offset: skip stRect_top full rows in all three buffers.
{
j_offset = stRect_top*Img_w;
xy_tab0 = xy_tab0 + (j_offset<<2);
cof_tab0= cof_tab0 + (j_offset<<3);
dst0 = dst0 + (j_offset<<1);
}
// Line offset: skip stRect_left entries within the row.
{
j_offset = stRect_left;
xy_tab0 = xy_tab0 + (j_offset<<2);
cof_tab0= cof_tab0 + (j_offset<<3);
dst0 = dst0 + (j_offset<<1);
}
// Row loop. Each inner iteration covers two pixels, hence the halved count.
i_LoopNum = stRect_right-stRect_left;
i_LoopNum = i_LoopNum>>1;
for (Index_j = stRect_top; Index_j <stRect_bottom; Index_j++)
{
// pre_init: take this row's pointers, then advance the bases by one full row.
XY_tab_Addr = xy_tab0;
pXY_tab = XY_tab_Addr;
xy_tab0+=(Img_w<<2);
pCof_tab = cof_tab0;
cof_tab0+=(Img_w<<3);
pDtmp = dst0;
dst0+=(Img_w<<1);
// pre_Loop (pipeline prologue): issue the loads for the first pixel pair and
// fetch the offsets of the second pair. The C1/C2/C3 tags presumably mark
// which pixel pair (pipeline stage) a statement belongs to - confirm.
{
pCof_tab1 = &pCof_tab[1];
offset_12 = *pXY_tab++;//C1
offset_load_1 = _loll(offset_12);//C1
offset_load_2 = _hill(offset_12);//C1
// 8-byte loads at the same byte offset in the addressed row (A) and the next row (B).
data_load_A_1 = _mem8(src+offset_load_1);//C1
data_load_B_1 = _mem8(src1+offset_load_1);//C1
data_load_A_2 = _mem8(src+offset_load_2);//C1
data_load_B_2 = _mem8(src1+offset_load_2);//C1
// Bit 1 of the byte offset; drives the select/shift choice in the loop body.
offset_load_1_is = offset_load_1&2;//C1
offset_load_2_is = offset_load_2&2;//C1
offset_12 = *pXY_tab++;//C2
}
// One extra iteration (i_LoopNum + 1) drains the pipeline: the value stored in
// iteration k was dot-multiplied in iteration k - 1.
for (Index_i =0; Index_i <i_LoopNum+1; Index_i++)
{
// Pixel 1 uses the upper 32 bits of its two loads, pixel 2 the lower 32 bits.
Data32_7654_A_1 = _hill(data_load_A_1);
Data32_7654_B_1 = _hill(data_load_B_1);
Data64_DP2_76765454_1 = _dpack2(Data32_7654_B_1, Data32_7654_A_1);
Data32_3210_A_2 = _loll(data_load_A_2);
Data32_3210_B_2 = _loll(data_load_B_2);
Data64_DP2_32321010_2 = _dpack2(Data32_3210_B_2, Data32_3210_A_2);
Data64_DPH4_75753131_1 = _dpackh4(data_load_B_1, data_load_A_1); //76543210 76543210>7575 3131
Data64_DPH4_75753131_2 = _dpackh4(data_load_B_2, data_load_A_2); //76543210 76543210>7575 3131
Data32_7575_1 = _hill(Data64_DPH4_75753131_1);
Data32_3131_1 = _loll(Data64_DPH4_75753131_1);
// Pixel 1: when bit 1 of the offset is clear, both halves take the 7575 word
// with no shift; otherwise the 7575/3131 pair is shifted right by 8.
// NOTE(review): presumably selects the chroma byte matching the pixel's
// position inside a YUV422 pair - confirm against the pixel layout.
Shift_Num_1 = 8;
if(offset_load_1_is==0)
{
Data32_3131_1 = Data32_7575_1;
Shift_Num_1 = 0;
}
Data64_MV55_75753131_1 = _itoll(Data32_7575_1, Data32_3131_1);
Data64_SHFU_75753131_1 = _dshru(Data64_MV55_75753131_1, Shift_Num_1);
Data32_7575_2 = _hill(Data64_DPH4_75753131_2);
Data32_3131_2 = _loll(Data64_DPH4_75753131_2);
// Pixel 2: the mirror case - when bit 1 is clear, both halves take the 3131
// word shifted right by 8; otherwise the pair is used unshifted.
// NOTE(review): the asymmetry with pixel 1 looks intentional but is not
// explained in the source - confirm.
Shift_Num_2 = 0;
if(offset_load_2_is==0)
{
Data32_7575_2 = Data32_3131_2;
Shift_Num_2 = 8;
}
Data64_MV33_75753131_2 = _itoll(Data32_7575_2, Data32_3131_2);
Data64_SHFU_75753131_2 = _dshru(Data64_MV33_75753131_2, Shift_Num_2);
// Keep only the low byte of each 16-bit lane so every sample becomes a 16-bit operand.
Data64_Y0_0B0D0A0C = Data64_DP2_76765454_1 & Data64_AND_00FF;
Data64_Y1_0B0D0A0C = Data64_DP2_32321010_2 & Data64_AND_00FF;
Data64_U0_0B0D0A0C = Data64_SHFU_75753131_1 & Data64_AND_00FF;
Data64_V1_0B0D0A0C = Data64_SHFU_75753131_2 & Data64_AND_00FF;
// Loop optimization begin: issue the loads for the NEXT pixel pair now and
// fetch the offsets of the pair after that.
offset_load_1 = _loll(offset_12);//C2
offset_load_2 = _hill(offset_12);//C2
data_load_A_1 = _mem8(src+offset_load_1);//C2
data_load_B_1 = _mem8(src1+offset_load_1);//C2
data_load_A_2 = _mem8(src+offset_load_2);//C2
data_load_B_2 = _mem8(src1+offset_load_2);//C2
offset_load_1_is = offset_load_1&2;//C2
offset_load_2_is = offset_load_2&2;//C2
offset_12 = *pXY_tab++;//C3
// Loop optimization end.
//-----------------
// Two consecutive 64-bit coefficient entries (one per pixel of the pair);
// both pointers step by two entries per iteration.
Cof_ABCD_1 = *pCof_tab;pCof_tab+=2;
Cof_ABCD_2 = *pCof_tab1;pCof_tab1+=2;
D128_C1C0_0B0D0A0C = _llto128(Cof_ABCD_2, Cof_ABCD_1);
D128_Y1Y0_0B0D0A0C = _llto128(Data64_Y1_0B0D0A0C, Data64_Y0_0B0D0A0C);
D128_V1U0_0B0D0A0C = _llto128(Data64_V1_0B0D0A0C, Data64_U0_0B0D0A0C);
// Dot-product delay: the shifts below consume the results written by the
// PREVIOUS iteration, before this iteration overwrites them. On the first
// iteration they read not-yet-assigned values; that output is discarded by the
// if (Index_i) guard further down.
D64_Y1Y0_SHRU = _dshr(D64_Y1Y0_Dot16, BITOFF);
D64_V1U0_SHRU = _dshr(D64_V1U0_Dot16, BITOFF);
D64_Y1Y0_Dot16 = _ddotpsu4h(D128_Y1Y0_0B0D0A0C, D128_C1C0_0B0D0A0C);
D64_V1U0_Dot16 = _ddotpsu4h(D128_V1U0_0B0D0A0C, D128_C1C0_0B0D0A0C);
// Pack the four shifted results into one 32-bit word (named V1 Y1 U0 Y0).
D64_V1_Y1_U0_Y0 = _dpackl4(D64_V1U0_SHRU, D64_Y1Y0_SHRU);
D32_V1_Y1 = _hill(D64_V1_Y1_U0_Y0);
D32_U0_Y0 = _loll(D64_V1_Y1_U0_Y0);
D32_V1Y1U0Y0 = _packl4(D32_V1_Y1, D32_U0_Y0);
// Skip the store on iteration 0: no valid result is available yet.
if(Index_i)
{
*pDtmp++ = D32_V1Y1U0Y0;
}
//-----------------
}
}
return ;
}
</code>