Fixed few issues in SAO arm assemblies

There were few mismatches seen because of wrong clipping and wrong increments in SAO assemblies Change-Id: I8ab28d847b1708b6949eac514f99e475e792cde1

Fixed few issues in SAO arm assemblies
There were few mismatches seen because of wrong clipping and wrong increments in SAO assemblies Change-Id: I8ab28d847b1708b6949eac514f99e475e792cde1
489e8e68 · Harish Mahendrakar · 3426916a · 489e8e68 · 489e8e68 · 489e8e68
Commit 489e8e68 authored 8 years ago by Harish Mahendrakar
7 changed files
--- a/common/arm/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
@@ -829,6 +829,10 @@ SRC_LEFT_LOOP_WD_16_HT_4:

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0x114]              @Loads wd
+    LDR         r0,[sp,#0x02]               @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
    BGT         WD_16_HT_4_LOOP



--- a/common/arm/ihevc_sao_edge_offset_class3.s
+++ b/common/arm/ihevc_sao_edge_offset_class3.s
@@ -691,6 +691,10 @@ SRC_LEFT_LOOP_WD_16_HT_4:

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0xD0]               @Loads wd
+    LDR         r0,[sp,#0x90]               @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
    BGT         WD_16_HT_4_LOOP             @If not equal jump to width_loop



--- a/common/arm/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
@@ -851,6 +851,10 @@ SRC_LEFT_LOOP_WD_16_HT_4:

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
+    LDR         r7,[sp,#0x114]              @Loads wd
+    LDR         r0,[sp,#0x02]               @Loads *pu1_src
+    SUB         r7,r7,r6
+    ADD         r0,r0,r7
    BGT         WD_16_HT_4_LOOP             @If not equal jump to width_loop

 WIDTH_RESIDUE:

--- a/common/arm64/ihevc_sao_edge_offset_class2.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@@ -146,6 +146,9 @@ PU1_AVAIL_4_LOOP:
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_7_LOOP:
    LDRB        w14,[x5,#7]                 //pu1_avail[7]
@@ -190,6 +193,9 @@ PU1_AVAIL_7_LOOP:
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL:
    MOV         x12,x8                      //Move ht

--- a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
@@ -165,6 +165,9 @@ PU1_AVAIL_4_LOOP_U:
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_4_LOOP_V:

@@ -201,6 +204,9 @@ PU1_AVAIL_4_LOOP_V:
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_7_LOOP_U:
    STRB        w10,[sp,#7]
@@ -249,6 +255,9 @@ PU1_AVAIL_7_LOOP_U:
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_7_LOOP_V:
    ADD         x12,x12,#1
@@ -286,6 +295,9 @@ PU1_AVAIL_7_LOOP_V:
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_3_LOOP:
    STRB        w10,[sp,#8]
@@ -924,6 +936,10 @@ SRC_LEFT_LOOP_WD_16_HT_4:

    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    mov         w7, w24                     //Loads wd
+    mov         x0, x27                     //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
    BGT         WD_16_HT_4_LOOP



--- a/common/arm64/ihevc_sao_edge_offset_class3.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3.s
@@ -151,6 +151,9 @@ PU1_AVAIL_5_LOOP:
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_6_LOOP:
    LDRB        w10,[x5,#6]                 //pu1_avail[6]
@@ -198,6 +201,9 @@ PU1_AVAIL_6_LOOP:
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_3_LOOP:
    MOV         x21,x2
@@ -713,6 +719,10 @@ SRC_LEFT_LOOP_WD_16_HT_4:

    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    MOV         x7,x16                      //Loads wd
+    MOV         x0,x15                      //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
    BGT         WD_16_HT_4_LOOP             //If not equal jump to width_loop



--- a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
@@ -160,6 +160,9 @@ PU1_AVAIL_5_LOOP_U:
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_5_LOOP_V:

@@ -194,6 +197,9 @@ PU1_AVAIL_5_LOOP_V:
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_6_LOOP_U:
    STRB        w9,[sp,#6]
@@ -240,6 +246,9 @@ PU1_AVAIL_6_LOOP_U:
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x10,x20
+    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_6_LOOP_V:
    ADD         x12,x12,#1                  //pu1_src[(ht - 1) * src_strd + 1]
@@ -276,6 +285,9 @@ PU1_AVAIL_6_LOOP_V:
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+    mov         x20,#0
+    cmp         x9,x20
+    csel        x9, x20, x9, LT             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

 PU1_AVAIL_3_LOOP:
    STRB        w10,[sp,#8]
@@ -933,6 +945,10 @@ SRC_LEFT_LOOP_WD_16_HT_4:

    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
+    mov         w7, w24                     //Loads wd
+    mov         x0, x28                     //Loads *pu1_src
+    SUB         x7,x7,x6
+    ADD         x0,x0,x7
    BGT         WD_16_HT_4_LOOP             //If not equal jump to width_loop

 WIDTH_RESIDUE: