Commit 0d8951ce authored by Harish Mahendrakar, committed by Lajos Molnar

Initial Version of HEVC decoder

Compliant with the reference software HM11.0 onwards

Bug: 14571712
Change-Id: I8af25c1221cc6ab70440141c4d9b48c1ac69696a
parent 446ae524
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
# decoder
include $(LOCAL_PATH)/decoder.mk
Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at:
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/*******************************************************************************
@* @file
@* ihevc_deblk_chroma_horz.s
@*
@* @brief
@* contains function definitions for the horizontal chroma deblocking
@* filter. functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* anand s
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************/
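@ Reference form of the chroma deblocking applied by this function (per the
@ HEVC specification; an orientation sketch in C, not the decoder's code):
@   delta = clip3((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc);
@   p0'   = clip_u8(p0 + delta);   @ written only if filter_flag_p is set
@   q0'   = clip_u8(q0 - delta);   @ written only if filter_flag_q is set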
.text
.align 4
.extern gai4_ihevc_qp_table
.extern gai4_ihevc_tc_table
.globl ihevc_deblk_chroma_horz_a9q
gai4_ihevc_qp_table_addr:
.long gai4_ihevc_qp_table - ulbl1 - 8
gai4_ihevc_tc_table_addr:
.long gai4_ihevc_tc_table - ulbl2 - 8
.type ihevc_deblk_chroma_horz_a9q, %function
ihevc_deblk_chroma_horz_a9q:
push {r4-r12,lr}
sub r12,r0,r1
vld1.8 {d0},[r0]
sub r5,r12,r1
add r6,r0,r1
add r1,r2,r3
vmovl.u8 q0,d0
ldr r10,[sp,#0x28]
vld1.8 {d2},[r12]
add r2,r1,#1
ldr r4,[sp,#0x30]
vld1.8 {d4},[r5]
ldr r8,[sp,#0x34]
vld1.8 {d16},[r6]
ldr r9,[sp,#0x38]
adds r1,r10,r2,asr #1
vmovl.u8 q1,d2
ldr r7,[sp,#0x2c]
ldr r3,gai4_ihevc_qp_table_addr
ulbl1:
add r3, r3, pc
bmi l1.3312
cmp r1,#0x39
ldrle r1,[r3,r1,lsl #2]
subgt r1,r1,#6
l1.3312:
adds r2,r7,r2,asr #1
vmovl.u8 q2,d4
bmi l1.3332
cmp r2,#0x39
ldrle r2,[r3,r2,lsl #2]
subgt r2,r2,#6
l1.3332:
add r1,r1,r4,lsl #1
vsub.i16 q3,q0,q1
add r3,r1,#2
cmp r3,#0x35
movgt r1,#0x35
vshl.i16 q3,q3,#2
vmovl.u8 q8,d16
bgt l1.3368
adds r3,r1,#2
addpl r1,r1,#2
movmi r1,#0
l1.3368:
ldr r3,gai4_ihevc_tc_table_addr
ulbl2:
add r3, r3, pc
vadd.i16 q2,q3,q2
add r2,r2,r4,lsl #1
vsub.i16 q3,q2,q8
add r4,r2,#2
ldr r1,[r3,r1,lsl #2]
cmp r4,#0x35
movgt r2,#0x35
bgt l1.3412
adds r4,r2,#2
addpl r2,r2,#2
movmi r2,#0
l1.3412:
ldr r2,[r3,r2,lsl #2]
cmp r8,#0
vdup.16 q8,r2
vdup.16 q2,r1
rsb r1,r1,#0
vrshr.s16 q3,q3,#3
vdup.16 q9,r1
rsb r1,r2,#0
vzip.16 q2,q8
vdup.16 q10,r1
vzip.16 q9,q10
vmin.s16 q8,q3,q2
vmax.s16 q2,q9,q8
vadd.i16 q1,q1,q2
vsub.i16 q0,q0,q2
vqmovun.s16 d2,q1
vqmovun.s16 d0,q0
beq l1.3528
vst1.8 {d2},[r12]
l1.3528:
cmp r9,#0
beq l1.3540
vst1.8 {d0},[r0]
l1.3540:
pop {r4-r12,pc}
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@* ihevc_deblk_chroma_vert.s
@*
@* @brief
@* contains function definitions for the vertical chroma deblocking
@* filter. functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* anand s
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************/
.text
.align 4
.extern gai4_ihevc_qp_table
.extern gai4_ihevc_tc_table
.globl ihevc_deblk_chroma_vert_a9q
gai4_ihevc_qp_table_addr:
.long gai4_ihevc_qp_table - ulbl1 - 8
gai4_ihevc_tc_table_addr:
.long gai4_ihevc_tc_table - ulbl2 - 8
.type ihevc_deblk_chroma_vert_a9q, %function
ihevc_deblk_chroma_vert_a9q:
push {r4-r12,lr}
sub r8,r0,#4
add r2,r2,r3
vld1.8 {d5},[r8],r1
add r2,r2,#1
vld1.8 {d17},[r8],r1
ldr r7,[sp,#0x28]
vld1.8 {d16},[r8],r1
ldr r4,[sp,#0x38]
vld1.8 {d4},[r8]
ldr r5,[sp,#0x30]
vtrn.8 d5,d17
adds r3,r7,r2,asr #1
vtrn.8 d16,d4
ldr r7,gai4_ihevc_qp_table_addr
ulbl1:
add r7,r7,pc
ldr r12,[sp,#0x34]
ldr r6,[sp,#0x2c]
bmi l1.2944
cmp r3,#0x39
ldrle r3,[r7,r3,lsl #2]
subgt r3,r3,#6
l1.2944:
vtrn.16 d5,d16
adds r2,r6,r2,asr #1
vtrn.16 d17,d4
bmi l1.2964
cmp r2,#0x39
ldrle r2,[r7,r2,lsl #2]
subgt r2,r2,#6
l1.2964:
vtrn.32 d5,d17
add r3,r3,r5,lsl #1
vtrn.32 d16,d4
add r6,r3,#2
vmovl.u8 q9,d17
cmp r6,#0x35
movgt r3,#0x35
bgt l1.2996
adds r6,r3,#2
addpl r3,r3,#2
movmi r3,#0
l1.2996:
vsubl.u8 q0,d17,d16
ldr r6,gai4_ihevc_tc_table_addr
ulbl2:
add r6,r6,pc
vshl.i16 q0,q0,#2
add r2,r2,r5,lsl #1
add r5,r2,#2
vaddw.u8 q0,q0,d5
cmp r5,#0x35
ldr r3,[r6,r3,lsl #2]
vsubw.u8 q2,q0,d4
movgt r2,#0x35
bgt l1.3036
adds r5,r2,#2
addpl r2,r2,#2
movmi r2,#0
l1.3036:
vrshr.s16 q3,q2,#3
vdup.16 d2,r3
ldr r2,[r6,r2,lsl #2]
rsb r3,r3,#0
cmp r12,#0
vdup.16 d3,r2
rsb r2,r2,#0
vdup.16 d30,r3
vdup.16 d31,r2
vmin.s16 q2,q3,q1
vmax.s16 q1,q15,q2
vmovl.u8 q3,d16
vadd.i16 q0,q3,q1
vsub.i16 q1,q9,q1
vqmovun.s16 d0,q0
sub r2,r0,#2
vqmovun.s16 d1,q1
vtrn.32 d0,d1
vtrn.8 d0,d1
beq l1.3204
vst1.16 {d0[0]},[r2],r1
vst1.16 {d1[0]},[r2],r1
vst1.16 {d0[1]},[r2],r1
vst1.16 {d1[1]},[r2]
l1.3204:
cmp r4,#0
beq l1.3228
vst1.16 {d0[2]},[r0],r1
vst1.16 {d1[2]},[r0],r1
vst1.16 {d0[3]},[r0],r1
vst1.16 {d1[3]},[r0]
l1.3228:
pop {r4-r12,pc}
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/*******************************************************************************
@* @file
@* ihevc_deblk_luma_horz.s
@*
@* @brief
@* contains function definitions for the horizontal luma deblocking
@* filter. functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* anand s
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************/
.text
.align 4
.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_horz_a9q
gai4_ihevc_tc_table_addr:
.long gai4_ihevc_tc_table - ulbl1 - 8
gai4_ihevc_beta_table_addr:
.long gai4_ihevc_beta_table - ulbl2 - 8
.type ihevc_deblk_luma_horz_a9q, %function
ihevc_deblk_luma_horz_a9q:
stmfd sp!, {r3-r12,lr}
ldr r4,[sp,#0x2c]
ldr r5,[sp,#0x30]
add r3,r3,r4
add r3,r3,#1
ldr r6, [sp,#0x34]
asr r3,r3,#1
add r7,r3,r5,lsl #1
add r3,r3,r6,lsl #1
cmp r7,#0x33
movgt r7,#0x33
bgt l1.1532
cmp r7,#0x0
movlt r7,#0x0 @ r7 has the beta_index value
l1.1532:
@ bic r2,r2,#1
asr r2,r2,#1
add r3,r3,r2,lsl #1
cmp r3,#0x35
movgt r3,#0x35
bgt l1.1564
cmp r3,#0x0
movlt r3,#0x0 @ r3 has the tc_index value
@ qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
@ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
@ tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
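@ In reference C, the two clipped indices select the thresholds from the
@ tables declared above (clip3(x, lo, hi) clamps x into [lo, hi]):
@   beta = gai4_ihevc_beta_table[beta_indx];
@   tc   = gai4_ihevc_tc_table[tc_indx];
@   if(0 == tc) return;   @ nothing to filter on this edge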
l1.1564:
ldr r2,gai4_ihevc_beta_table_addr
ulbl2:
add r2,r2,pc
ldr r4,gai4_ihevc_tc_table_addr
ulbl1:
add r4,r4,pc
ldr r5,[r2,r7,lsl #2] @ beta
ldr r6,[r4,r3,lsl #2] @ tc
cmp r6,#0
beq l1.2404
vmov.i16 d0,#0x2
lsl r7,r6,#1
add r14,r1,r1,lsl #1
ldr r8,[r0,-r14] @ -3 value
vdup.8 d1,r7
ldr r10,[r0,-r1,lsl #1] @-2 value
vdup.32 d23,r8 @ -3 value
ldr r11,[r0,-r1] @-1 value
vdup.32 d24,r10 @ -2 value
and r8,#0xff
ldr r12,[r0,#0] @ 0 value
vdup.32 d25, r11 @-1 value
and r10,#0xff
ldr r9,[r0,r1] @ 1 value
vdup.32 d26,r12 @ 0 value
and r11,#0xff
ldr r2,[r0,r1,lsl #1] @ 2 value
vdup.32 d27,r9 @ 1 value
and r12,#0xff
vdup.32 d28,r2 @ 2 value
and r9,#0xff
and r2,#0xff
add r12,r12,r2
subs r9,r12,r9,lsl #1 @ dq0 value is stored in r9
rsbmi r9,r9,#0
@dq0 = abs( pu1_src[2 * src_strd] - 2 * pu1_src[1 * src_strd] + pu1_src[0] )@
add r8,r8,r11
subs r8,r8,r10,lsl #1
rsbmi r8,r8,#0 @ dp0 value is stored in r8
@ dp0 = abs( pu1_src[-3 * src_strd] - 2 * pu1_src[-2 * src_strd] + pu1_src[-1 * src_strd] )@
add r3,r1,r1,lsl #1
add r14,r0,#3
ldrb r2,[r14,-r3] @ -3 value
ldrb r10,[r14,-r1,lsl #1] @ -2 value
ldrb r11,[r14,-r1] @ -1 value
ldrb r12,[r14,#0] @ 0 value
ldrb r3,[r14,r1] @ 1 value
ldrb r4,[r14,r1,lsl #1] @ 2 value
add r12,r12,r4
subs r12,r12,r3,lsl #1 @ dq3value is stored in r12
rsbmi r12,r12,#0
@ dq3 = abs( pu1_src[2 * src_strd + 3] - 2 * pu1_src[1 * src_strd + 3] + pu1_src[3] )@
add r2,r2,r11
subs r11,r2,r10,lsl #1
rsbmi r11,r11,#0 @ dp3 value is stored in r11
@ dp3 = abs( pu1_src[-3 * src_strd + 3] - 2 * pu1_src[-2 * src_strd + 3] + pu1_src[-1 * src_strd + 3] )@
add r3,r8,r9 @ r3 has the d0 value
add r4,r11,r12 @ r4 has the d3 value
@ d0 = dp0 + dq0@
@ d3 = dp3 + dq3@
add r14,r8,r11 @ r14 has the value dp
add r12,r12,r9 @ r12 has the value dq
@ dp = dp0 + dp3@
@ dq = dq0 + dq3@
add r11, r3, r4 @ r11 has the value d
@ d = d0 + d3@
cmp r11,r5
bge l1.2404
@ if(d < beta)
@ registers which cannot be altered : r3,r4,r5,r6,r12,r14,r0,r1,r11
@ registers for use: r2,r7,r8,r9,r10
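@ Consolidated reference C for the activity test built up above (an
@ orientation sketch; dp0/dq0 come from line 0 and dp3/dq3 from line 3
@ of the 4-sample edge segment, per the formulas quoted above):
@   d0 = dp0 + dq0;
@   d3 = dp3 + dq3;
@   d  = d0 + d3;
@   if(d >= beta) return;   @ edge area is already smooth, skip filtering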
asr r10,r5,#2
vqadd.u8 d30,d26,d1
cmp r10,r3,lsl #1
vqsub.u8 d31,d26,d1
ble l1.1840
add r10,r1,r1,lsl #1
vaddl.u8 q3,d25,d26
ldr r2,[r0,-r1,lsl #2] @ has the -4 value
ldrb r7,[r0,-r1] @ has the -1 value
vdup.32 d22,r2 @ -4 value
vaddw.u8 q4,q3,d27
ldrb r3,[r0,#0] @ r4 has the 0 value
vqadd.u8 d16,d27,d1
and r2,#0xff
vmul.i16 q6,q4,d0[0]
ldr r8,[r0,r10] @ has the 3 value
vaddl.u8 q5,d24,d28
subs r2,r2,r7
vqsub.u8 d17,d27,d1
vdup.32 d29,r8 @ 3 value
and r8,#0xff
vadd.i16 q6,q6,q5
rsbmi r2,r2,#0
vrshrn.i16 d20,q6,#3
subs r8,r8,r3
rsbmi r8,r8,#0
vmin.u8 d18,d20,d30
add r8,r8,r2
cmp r8,r5,asr #3
bge l1.1840
vaddw.u8 q7,q4,d28
subs r7,r3,r7
vmax.u8 d4,d18,d31
rsbmi r7,r7,#0
vqadd.u8 d30,d28,d1
mov r10,#5
vrshrn.i16 d21,q7,#2
mul r10,r10,r6
vqsub.u8 d31,d28,d1
add r10,#1
cmp r7,r10,asr #1
vmin.u8 d18,d21,d16
bge l1.1840
@ if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3 * src_strd] - pu1_src[0]) + abs(pu1_src[-1 * src_strd] - pu1_src[-4 * src_strd]) < (beta >> 3) )
@ && ( abs(pu1_src[0] - pu1_src[-1 * src_strd]) < ( (5 * tc + 1) >> 1 ) ) )
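@ When these three conditions hold on both checked lines, the strong filter
@ is selected; its reference form on the p side (per the HEVC spec) is:
@   p0' = clip3((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, p0 - 2*tc, p0 + 2*tc)
@   p1' = clip3((p2 + p1 + p0 + q0 + 2) >> 2,            p1 - 2*tc, p1 + 2*tc)
@   p2' = clip3((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3,   p2 - 2*tc, p2 + 2*tc)
@ with symmetric expressions on the q side; the interleaved vaddl/vmul/
@ vrshrn/vmin/vmax sequences above and below compute these lane-wise.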
vmax.u8 d5,d18,d17
asr r10,r5,#2
vaddl.u8 q8,d29,d28
cmp r10,r4,lsl #1
ble l1.1840
add r10,r1,r1,lsl #1
vmul.i16 q8,q8,d0[0]
add r4,r0,#3
ldrb r2,[r4,-r1,lsl #2]
vadd.i16 q8,q8,q7
ldrb r7,[r4,-r1]
vrshrn.i16 d19,q8,#3
ldrb r3,[r4,#0]
ldrb r8,[r4,r10]
@ ubfx r7,r2,#24,#8 @ has the -1 value
@ and r2,#0xff @ has the -4 value
@ ubfx r8,r3,#24,#8 @ has the 3 value
@ and r3,#0xff @ r4 has the 0 value
subs r8,r8,r3
vmin.u8 d18,d19,d30
rsbmi r8,r8,#0
vaddl.u8 q3,d25,d24
subs r2,r2,r7
vmax.u8 d3,d18,d31
rsbmi r2,r2,#0
vaddw.u8 q4,q3,d26
add r8,r8,r2
vqadd.u8 d30,d25,d1
cmp r8,r5,asr #3
vqsub.u8 d31,d25,d1
bge l1.1840
vmul.i16 q6,q4,d0[0]
subs r7,r3,r7
vqadd.u8 d16,d24,d1
rsbmi r7,r7,#0
vaddl.u8 q5,d23,d27
mov r10,#5
vqsub.u8 d17,d24,d1
mul r10,r10,r6
vadd.i16 q6,q6,q5
add r10,#1
vrshrn.i16 d20,q6,#3
cmp r7,r10,asr #1
vaddw.u8 q7,q4,d23
bge l1.1840
vmin.u8 d18,d20,d30
mov r2,#2
vqadd.u8 d30,d23,d1
ldr r4,[sp,#0x38] @ loading the filter_flag_p
vmax.u8 d2,d18,d31
ldr r5,[sp,#0x3c] @ loading the filter_flag_q
vrshrn.i16 d21,q7,#2
b end_dep_deq_decision_horz
@ r2 has the value of de
@ r6 has the value of tc
@ r5 has the value of beta
@ r14 has the value of dp
@ r12 has the value of dq
@ r0 has the value of source address
@ r1 has the src stride
l1.1840:
mov r2,#1
mov r11,r5
ldr r4,[sp,#0x38] @ loading the filter_flag_p
ldr r5,[sp,#0x3c] @ loading the filter_flag_q
cmp r6,#1
moveq r9,#0
moveq r10,#0
beq end_dep_deq_decision_horz
and r7,r4,r5
cmp r7,#1
beq both_flags_set_horz
cmp r4,#0
beq set_flag_dep_zero_horz
add r8,r11,r11,asr #1
mov r10,#0
asr r8,#3
cmp r8,r14
movgt r9,#1
movle r9,#0
b end_dep_deq_decision_horz
set_flag_dep_zero_horz:
add r8,r11,r11,asr #1
mov r9,#0
asr r8,#3
cmp r8,r12
movgt r10,#1
movle r10,#0
b end_dep_deq_decision_horz
both_flags_set_horz:
add r8,r11,r11,asr #1
asr r8,#3
cmp r8,r14
movgt r9,#1
movle r9,#0
cmp r8,r12
movgt r10,#1
movle r10,#0
end_dep_deq_decision_horz:
@r0=source address
@r1=stride
@ r2 =de
@ r4=flag p
@r5= flag q
@r6 =tc
@ r9 =dep
@ r10=deq
@ add r14,r1,r1,lsl #1
@ lsl r7,r6,#1
@ vdup.8 d1,r7
@ vmov.i16 d0,#0x2
vmin.u8 d18,d21,d16
cmp r2,#1
vqsub.u8 d31,d23,d1
beq l1.2408
vaddl.u8 q4,d23,d22
cmp r5,#1
bne strong_filtering_p
strong_filtering_q:
mov r12,r0
vst1.32 d4[0],[r12],r1
vst1.32 d5[0],[r12],r1
vst1.32 d3[0],[r12]
cmp r4,#1
bne l1.2404
strong_filtering_p:
vmax.u8 d5,d18,d17
mov r12,r0
vmul.i16 q4,q4,d0[0]
rsb r11,r1,#0
vadd.i16 q8,q4,q7
add r12,r12,r11
vrshrn.i16 d19,q8,#3
vst1.32 d2[0],[r12],r11
vmin.u8 d18,d19,d30
vst1.32 d5[0],[r12],r11
vmax.u8 d3,d18,d31
vst1.32 d3[0],[r12]
l1.2404:
ldmfd sp!, {r3-r12,pc}
@ r4=flag p
@r5= flag q
@r6 =tc
@ r9 =dep
@ r10=deq
@ d22 -4 value
@d23 @ -3 value
@ vdup.32 d24,r11 @ -2 value
@ vdup.32 d25, r11 @-1 value
@ vdup.32 d26,r11 @ 0 value
@ vdup.32 d27,r11 @ 1value
@ vdup.32 d28,r11 @ 2 value
@ vdup.32 d29,r11 @ 3 value
l1.2408:
vmov.i16 d0,#0x9
vsubl.u8 q5,d26,d25
vmul.i16 q5,q5,d0[0]
vmov.i16 d0,#0x3
vsubl.u8 q6,d27,d24
vmul.i16 q6,q6,d0[0]
vdup.8 d30,r6 @ duplicating the +tc value
rsb r12,r6,#0
vdup.8 d31,r12 @ duplicating the -tc value
vsub.i16 q5,q5,q6
vrshr.s16 q5,q5,#4
@ delta = ( 9 * (pu1_src[0] - pu1_src[-1 * src_strd]) - 3 * (pu1_src[1 * src_strd] - pu1_src[-2 * src_strd]) + 8 ) >> 4@
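@ The remaining weak-filter steps in reference form (computed lane-wise by
@ the code below; dep/deq are the p1/q1 filter decisions made above):
@   if(ABS(delta) < 10 * tc)
@   {
@       delta = clip3(delta, -tc, tc);
@       p0'   = clip_u8(p0 + delta);
@       q0'   = clip_u8(q0 - delta);
@       if(dep) p1' = clip_u8(p1 + clip3((((p2 + p0 + 1) >> 1) - p1 + delta) >> 1, -(tc >> 1), tc >> 1));
@       if(deq) q1' = clip_u8(q1 + clip3((((q2 + q0 + 1) >> 1) - q1 - delta) >> 1, -(tc >> 1), tc >> 1));
@   }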
vabs.s16 q4,q5
vmovn.i16 d9,q4
@ storing the absolute values of delta in d9
vqmovn.s16 d10,q5
@ storing the clipped values of delta in d10
vmin.s8 d11,d10,d30
vmax.s8 d8,d31,d11 @ d8 has the value delta = clip3(delta, -tc, tc)@
vmovl.u8 q3,d25
vaddw.s8 q2,q3,d8
vqmovun.s16 d12,q2
vmovl.u8 q3,d26
vsubw.s8 q2,q3,d8
vqmovun.s16 d13,q2
mov r11,#0xa
mul r12,r11,r6
vdup.8 d2,r12 @ d2 has the 10*tc value
vmov d18,d24
vdup.8 d0,r6
vshr.s8 d0,#1
vneg.s8 d1,d0
cmp r4,#1
bne l1.2724
cmp r9,#1
bne l1.2700
@ d12 and d13 have the value temp_p0 and temp_q0
vaddl.u8 q7,d23,d25
vrshrn.u16 d14,q7,#1
vsubl.u8 q7,d14,d24
vaddw.s8 q7,q7,d8
vqshrn.s16 d14,q7,#1
vmin.s8 d15,d14,d0
vmax.s8 d14,d1,d15
@ d14 has the delta p value
vmovl.u8 q8,d24
vaddw.s8 q8,q8,d14
vqmovun.s16 d14,q8
@ d14 = tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
vcge.u8 d18,d9,d2
vbsl d18,d24,d14
l1.2700:
mov r12,r0
rsb r11,r1,#0
add r12,r11
vcge.u8 d19,d9,d2
vbsl d19,d25,d12
vst1.32 {d19[0]},[r12],r11
vst1.32 {d18[0]},[r12]
l1.2724:
cmp r5,#1
bne l1.2404
cmp r10,#1
vmov d18, d27
bne l1.2852
vaddl.u8 q7,d26,d28
vrshrn.u16 d14,q7,#1
vsubl.u8 q7,d14,d27
vsubw.s8 q7,q7,d8
vqshrn.s16 d14,q7,#1
vmin.s8 d15,d14,d0
vmax.s8 d14,d1,d15
@ d14 has the delta q value
vmovl.u8 q8,d27
vaddw.s8 q8,q8,d14
vqmovun.s16 d14,q8
vcge.u8 d18,d9,d2
vbsl d18,d27,d14
l1.2852:
mov r12,r0
vcge.u8 d19,d9,d2
vbsl d19,d26,d13
vst1.32 {d19[0]},[r12],r1
vst1.32 {d18[0]},[r12]
ldmfd sp!, {r3-r12,r15}
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@* ihevc_deblk_luma_vert.s
@*
@* @brief
@* contains function definitions for the vertical luma deblocking
@* filter. functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* anand s
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************/
.text
.align 4
.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_vert_a9q
gai4_ihevc_tc_table_addr:
.long gai4_ihevc_tc_table - ulbl1 - 8
gai4_ihevc_beta_table_addr:
.long gai4_ihevc_beta_table - ulbl2 - 8
.type ihevc_deblk_luma_vert_a9q, %function
ihevc_deblk_luma_vert_a9q:
push {r3-r12,lr}
ldr r4,[sp,#0x2c]
ldr r5,[sp,#0x30]
add r3,r3,r4
add r3,r3,#1
ldr r6, [sp,#0x34]
asr r3,r3,#1
add r7,r3,r5,lsl #1
add r3,r3,r6,lsl #1
cmp r7,#0x33
movgt r7,#0x33
bgt l1.56
cmp r7,#0x0
movlt r7,#0x0 @ r7 has the beta_index value
l1.56:
@ bic r2,r2,#1
asr r2,r2,#1
add r3,r3,r2,lsl #1
cmp r3,#0x35
movgt r3,#0x35
bgt l1.88
cmp r3,#0x0
movlt r3,#0x0 @ r3 has the tc_index value
@ qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
@ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
@ tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
l1.88:
ldr r2,gai4_ihevc_beta_table_addr
ulbl2:
add r2,r2,pc
vmov.i8 d18,#0x2
ldr r4,gai4_ihevc_tc_table_addr
ulbl1:
add r4,r4,pc
ldr r5,[r2,r7,lsl #2] @ beta
vmov.i16 q8,#0x2
ldr r6,[r4,r3,lsl #2] @ tc
lsl r8,r6,#1
cmp r6,#0
vdup.8 d19,r8
sub r7,r0,#4
vmov.i8 d23,#0x3
beq l1.964
vld1.8 {d24},[r7],r1
ldrb r8,[r0,#-3] @ -3 value
vld1.8 {d1},[r7],r1
ldrb r10,[r0,#-2] @-2 value
vld1.8 {d2},[r7],r1
ldrb r11,[r0,#-1] @-1 value
vld1.8 {d0},[r7]
ldrb r12,[r0,#0] @ 0 value
ldrb r9,[r0,#1] @ 1 value
vtrn.8 d24,d1
ldrb r2,[r0,#2] @ 2 value
vtrn.8 d2,d0
add r12,r12,r2
subs r9,r12,r9,lsl #1 @ dq0 value is stored in r9
rsbmi r9,r9,#0
@dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
vtrn.16 d24,d2
add r8,r8,r11
vtrn.16 d1,d0
subs r8,r8,r10,lsl #1
rsbmi r8,r8,#0 @ dp0 value is stored in r8
@ dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
add r14,r1,r1,lsl #1
add r14,r0,r14
vdup.32 d4,d24[1]
ldrb r2,[r14,#-3] @ -3 value
vdup.32 d7,d2[1]
ldrb r10,[r14,#-2] @ -2 value
vdup.32 d3,d2[0]
ldrb r11,[r14,#-1] @ -1 value
vdup.32 d5,d1[1]
ldrb r12,[r14,#0] @ 0 value
vdup.32 d6,d1[0]
ldrb r3,[r14,#1] @ 1 value
vdup.32 d2,d0[0]
ldrb r4,[r14,#2] @ 2 value
add r12,r12,r4
subs r12,r12,r3,lsl #1 @ dq3value is stored in r12
rsbmi r12,r12,#0
@ dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
add r2,r2,r11
subs r11,r2,r10,lsl #1
rsbmi r11,r11,#0 @ dp3 value is stored in r11
@ dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )@
add r3,r8,r9 @ r3 has the d0 value
add r4,r11,r12 @ r4 has the d3 value
@ d0 = dp0 + dq0@
@ d3 = dp3 + dq3@
add r14,r8,r11 @ r14 has the value dp
add r12,r12,r9 @ r12 has the value dq
@ dp = dp0 + dp3@
@ dq = dq0 + dq3@
add r11, r3, r4 @ r11 has the value d
@ d = d0 + d3@
cmp r11,r5
vdup.32 d22,d0[1]
bge l1.964
@ if(d < beta)
@ registers which cannot be altered : r3,r4,r5,r6,r12,r14,r0,r1,r11
@ registers for use: r2,r7,r8,r9,r10
vqsub.u8 d30,d7,d19
asr r10,r5,#2
vqadd.u8 d31,d7,d19
cmp r10,r3,lsl #1
vaddl.u8 q0,d5,d4
ble l1.336
ldrb r2,[r0,#-4]
vaddw.u8 q0,q0,d2
ldrb r7,[r0,#-1]
vmull.u8 q10,d7,d23
ldrb r3,[r0,#0]
vmlal.u8 q10,d22,d18
ldrb r8,[r0,#3]
@ ubfx r7,r2,#24,#8 @ has the -1 value
@ and r2,#0xff @ has the -4 value
@ ubfx r8,r3,#24,#8 @ has the 3 value
@ and r3,#0xff @ r4 has the 0 value
vadd.i16 q10,q10,q0
subs r8,r8,r3
vrshrn.i16 d22,q10,#3
rsbmi r8,r8,#0
subs r2,r2,r7
vmin.u8 d21,d22,d31
rsbmi r2,r2,#0
vmax.u8 d22,d21,d30
add r8,r8,r2
vaddl.u8 q10,d7,d3
cmp r8,r5,asr #3
vmla.i16 q10,q0,q8
bge l1.336
vaddw.u8 q0,q0,d7
subs r7,r3,r7
vrshrn.i16 d20,q10,#3
rsbmi r7,r7,#0
vrshrn.i16 d0,q0,#2
mov r10,#5
vqadd.u8 d30,d5,d19
mul r10,r10,r6
vqsub.u8 d31,d5,d19
add r10,#1
cmp r7,r10,asr #1
bge l1.336
@ if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
@ && ( abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) ) )
asr r10,r5,#2
vqsub.u8 d25,d4,d19
cmp r10,r4,lsl #1
vqadd.u8 d21,d4,d19
ble l1.336
vmin.u8 d26,d20,d21
add r4,r1,r1,lsl #1
add r4,r4,r0
vmax.u8 d20,d26,d25
ldrb r2,[r4,#-4]
vmin.u8 d19,d0,d30
ldrb r7,[r4,#-1]
vmax.u8 d21,d19,d31
ldrb r3,[r4,#0]
lsl r10,r6,#1
ldrb r8,[r4,#3]
@ ubfx r7,r2,#24,#8 @ has the -1 value
@ and r2,#0xff @ has the -4 value
@ ubfx r8,r3,#24,#8 @ has the 3 value
@ and r3,#0xff @ r4 has the 0 value
vaddl.u8 q0,d2,d3
vdup.8 d19,r10
subs r8,r8,r3
vaddw.u8 q0,q0,d4
rsbmi r8,r8,#0
vqadd.u8 d30,d2,d19
subs r2,r2,r7
vqsub.u8 d31,d2,d19
rsbmi r2,r2,#0
vaddl.u8 q13,d5,d6
add r8,r8,r2
vmla.i16 q13,q0,q8
cmp r8,r5,asr #3
bge l1.336
vrshrn.i16 d26,q13,#3
subs r7,r3,r7
vqadd.u8 d27,d3,d19
rsbmi r7,r7,#0
vqsub.u8 d28,d3,d19
mov r10,#5
vmin.u8 d16,d26,d30
mul r10,r10,r6
add r10,#1
cmp r7,r10,asr #1
vmax.u8 d26,d16,d31
bge l1.336
vqadd.u8 d30,d6,d19
mov r2,#2
ldr r4,[sp,#0x38] @ loading the filter_flag_p
vqsub.u8 d31,d6,d19
ldr r5,[sp,#0x3c] @ loading the filter_flag_q
b end_dep_deq_decision
@ r2 has the value of de
@ r6 has the value of tc
@ r5 has the value of beta
@ r14 has the value of dp
@ r12 has the value of dq
@ r0 has the value of source address
@ r1 has the src stride
l1.336:
mov r2,#1
l1.424:
mov r11,r5
ldr r4,[sp,#0x38] @ loading the filter_flag_p
ldr r5,[sp,#0x3c] @ loading the filter_flag_q
cmp r6,#1
moveq r9,#0
moveq r10,#0
beq end_dep_deq_decision
and r7,r4,r5
cmp r7,#1
beq both_flags_set
cmp r4,#0
beq set_flag_dep_zero
add r8,r11,r11,asr #1
mov r10,#0
asr r8,#3
cmp r8,r14
movgt r9,#1
movle r9,#0
b end_dep_deq_decision
set_flag_dep_zero:
add r8,r11,r11,asr #1
mov r9,#0
asr r8,#3
cmp r8,r12
movgt r10,#1
movle r10,#0
b end_dep_deq_decision
both_flags_set:
add r8,r11,r11,asr #1
asr r8,#3
cmp r8,r14
movgt r9,#1
movle r9,#0
cmp r8,r12
movgt r10,#1
movle r10,#0
end_dep_deq_decision:
@r0=source address
@r1=stride
@ r2 =de
@ r4=flag p
@r5= flag q
@r6 =tc
@ r9 =dep
@ r10=deq
@ b l1.964
cmp r2,#2
@ r4 has the value of de
bne l1.968
cmp r5,#0
beq l1.780
@ r5 has the flag of q
add r3,r0,#2
vst1.8 {d22[0]},[r3],r1
vst1.8 {d22[1]},[r3],r1
vst1.8 {d22[2]},[r3],r1
vst1.8 {d22[3]},[r3]
add r3,r0,r1
vtrn.8 d20,d21
vst1.16 {d20[0]},[r0]
vst1.16 {d21[0]},[r3],r1
vst1.16 {d20[1]},[r3],r1
vst1.16 {d21[1]},[r3]
l1.780:
cmp r4,#0
beq l1.964
@ r5 has the flag p
vdup.32 d7,d24[0]
sub r3,r0,#1
vaddw.u8 q8,q0,d6
add r7,r3,r1
vrshrn.i16 d2,q8,#2
vst1.8 {d26[0]},[r3]
sub r0,r0,#3
vmin.u8 d16,d2,d27
vst1.8 {d26[1]},[r7],r1
vmull.u8 q1,d6,d23
vmlal.u8 q1,d7,d18
vst1.8 {d26[2]},[r7],r1
vmax.u8 d5,d16,d28
vst1.8 {d26[3]},[r7]
vadd.i16 q0,q1,q0
vrshrn.i16 d0,q0,#3
vmin.u8 d1,d0,d30
vmax.u8 d0,d1,d31
vtrn.8 d0,d5
vst1.16 {d0[0]},[r0],r1
vst1.16 {d5[0]},[r0],r1
vst1.16 {d0[1]},[r0],r1
vst1.16 {d5[1]},[r0]
l1.964:
pop {r3-r12,pc}
l1.968:
vmov.i16 q0,#0x9
rsb r11,r6,#0
cmp r4,#0
@ checks for the flag p
vmov.i16 q8,#0x3
vmov.i8 d24,#0x1
vdup.8 d30,r11
and r11,r6,#0xff
vdup.8 d31,r11
vsubl.u8 q9,d4,d2
vmul.i16 q9,q9,q0
vsubl.u8 q0,d5,d3
vmul.i16 q8,q0,q8
vsub.i16 q8,q9,q8
vrshr.s16 q8,q8,#4
@ delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
vabs.s16 q0,q8
vmovn.i16 d0,q0
@ storing the absolute values of delta in d0
vqmovn.s16 d16,q8
@ storing the clipped values of delta in d16
vmov.i8 d1,#0xa
vdup.8 d21,r11
vmul.i8 d1,d1,d21
@ d1 stores the value (10 * tc)
@if(abs(delta) < 10 * tc)
vmin.s8 d18,d16,d31
vmax.s8 d20,d18,d30
@ delta = clip3(delta, -tc, tc)@
vmovl.s8 q8,d20
vmovl.u8 q9,d2
vadd.i16 q9,q9,q8
vqmovun.s16 d22,q9
vmovl.u8 q9,d4
vsub.i16 q8,q9,q8
vqmovun.s16 d23,q8
@ tmp_p0 = clip_u8(pu1_src[-1] + delta)@
@ tmp_q0 = clip_u8(pu1_src[0] - delta)@
beq l1.1272
cmp r9,#1
bne l1.1212
@ checks for the flag dep
asr r3,r6,#1
vaddl.u8 q8,d6,d2
vaddw.u8 q8,q8,d24
vdup.8 d18,r3
rsb r3,r3,#0
vdup.8 d19,r3
vshr.u16 q8,q8,#1
vmovn.i16 d16,q8
vsubl.u8 q8,d16,d3
vaddw.s8 q8,q8,d20
vshr.s16 q8,q8,#1
vqmovn.s16 d16,q8
vmin.s8 d17,d16,d18
vmax.s8 d16,d19,d17
vmovl.u8 q9,d3
vmovl.s8 q8,d16
vadd.i16 q8,q9,q8
vqmovun.s16 d16,q8
vmov d30,d3
vcge.u8 d3,d0,d1
vbsl d3,d30,d16
l1.1212:
vdup.8 d16,r11
sub r12,r0,#3
sub r3,r0,#1
@ vmul.i8 d16,d16,d1
vtrn.8 d6,d3
vst1.16 {d6[0]},[r12],r1
vcge.u8 d16,d0,d1
vst1.16 {d3[0]},[r12],r1
vbsl d16,d2,d22
vst1.8 {d16[0]},[r3],r1
vst1.8 {d16[1]},[r3],r1
vst1.16 {d6[1]},[r12],r1
vst1.8 {d16[2]},[r3],r1
vst1.16 {d3[1]},[r12]
vst1.8 {d16[3]},[r3]
l1.1272:
@ ldr r3,[sp,#0x38]
cmp r5,#0
beq l1.964
@ checks for the flag q
cmp r10,#1
bne l1.1412
@ checks for the flag deq
vmov d2,d7
asr r3,r6,#1
vdup.8 d6,r3
rsb r3,r3,#0
vdup.8 d16,r3
vaddl.u8 q1,d2,d4
vaddw.u8 q1,q1,d24
vshr.u16 q1,q1,#1
vmovn.i16 d2,q1
vsubl.u8 q1,d2,d5
vsubw.s8 q1,q1,d20
vshr.s16 q1,q1,#1
vqmovn.s16 d3,q1
vmin.s8 d2,d3,d6
vmax.s8 d3,d16,d2
@ vdup.8 d6,r2
@ vmul.i8 d6,d6,d1
vmovl.u8 q8,d5
vmovl.s8 q1,d3
vadd.i16 q1,q8,q1
vqmovun.s16 d3,q1
vmov d30,d5
vcge.u8 d5,d0,d1
vbsl d5,d30,d3
l1.1412:
@ vdup.8 d2,r2
add r3,r0,#2
add r11,r3,r1
@ vmul.i8 d1,d2,d1
vst1.8 {d7[0]},[r3]
vst1.8 {d7[1]},[r11],r1
vst1.8 {d7[2]},[r11],r1
vcge.u8 d0,d0,d1
vst1.8 {d7[3]},[r11]
vbsl d0,d4,d23
vtrn.8 d0,d5
vst1.16 {d0[0]},[r0],r1
vst1.16 {d5[0]},[r0],r1
vst1.16 {d0[1]},[r0],r1
vst1.16 {d5[1]},[r0]
pop {r3-r12,pc}
/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
* ihevc_func_selector.h
*
* @brief
* For each function, decides whether to use the C function, Neon intrinsics,
* Cortex-A8 intrinsics, Neon assembly, or Cortex-A8 assembly
*
* @author
* Harish
*
* @remarks
* None
*
*******************************************************************************
*/
#ifndef __IHEVC_FUNC_SELECTOR_H__
#define __IHEVC_FUNC_SELECTOR_H__
#include "ihevc_func_types.h"
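
/* A hypothetical sketch of how such selector tokens are typically consumed
 * (the tokens themselves, C / A9Q / ..., presumably come from
 * ihevc_func_types.h included above; the macro names below are illustrative
 * only, not this library's actual plumbing): the token is pasted onto the
 * base function name to pick one implementation at build time.
 *
 *     #define FN_SELECT_(name, arch)  name##_##arch
 *     #define FN_SELECT(name, arch)   FN_SELECT_(name, arch)
 *     // e.g. FN_SELECT(ihevc_deblk_luma_horz, a9q) -> ihevc_deblk_luma_horz_a9q
 */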
#define INTER_PRED_LUMA_COPY C
#define INTER_PRED_LUMA_HORZ C
#define INTER_PRED_LUMA_VERT C
#define INTER_PRED_LUMA_COPY_W16OUT C
#define INTER_PRED_LUMA_HORZ_W16OUT C
#define INTER_PRED_LUMA_VERT_W16OUT C
#define INTER_PRED_LUMA_VERT_W16INP C
#define INTER_PRED_LUMA_VERT_W16INP_W16OUT C
#define INTER_PRED_CHROMA_COPY C
#define INTER_PRED_CHROMA_HORZ C
#define INTER_PRED_CHROMA_VERT C
#define INTER_PRED_CHROMA_COPY_W16OUT C
#define INTER_PRED_CHROMA_HORZ_W16OUT C
#define INTER_PRED_CHROMA_VERT_W16OUT C
#define INTER_PRED_CHROMA_VERT_W16INP C
#define INTER_PRED_CHROMA_VERT_W16INP_W16OUT C
#define WEIGHTED_PRED_UNI C
#define WEIGHTED_PRED_BI C
#define WEIGHTED_PRED_BI_DEFAULT C
#define WEIGHTED_PRED_CHROMA_UNI C
#define WEIGHTED_PRED_CHROMA_BI C
#define WEIGHTED_PRED_CHROMA_BI_DEFAULT C
#define PAD_VERT C
#define PAD_HORZ C
#define PAD_LEFT_LUMA C
#define PAD_LEFT_CHROMA C
#define PAD_RIGHT_LUMA C
#define PAD_RIGHT_CHROMA C
#define DEBLOCKING_ASM C
#define DEBLK_LUMA_HORZ C
#define DEBLK_LUMA_VERT C
#define DEBLK_CHROMA_HORZ C
#define DEBLK_CHROMA_VERT C
#define SAO_BAND_OFFSET_LUMA C
#define SAO_BAND_OFFSET_CHROMA C
#define SAO_EDGE_OFFSET_CLASS0_LUMA C
#define SAO_EDGE_OFFSET_CLASS1_LUMA C
#define SAO_EDGE_OFFSET_CLASS2_LUMA C
#define SAO_EDGE_OFFSET_CLASS3_LUMA C
#define SAO_EDGE_OFFSET_CLASS0_CHROMA C
#define SAO_EDGE_OFFSET_CLASS1_CHROMA C
#define SAO_EDGE_OFFSET_CLASS2_CHROMA C
#define SAO_EDGE_OFFSET_CLASS3_CHROMA C
#define INTRA_PRED_LUMA_REF_SUBSTITUTION C
#define INTRA_PRED_REF_FILTERING C
#define INTRA_PRED_LUMA_PLANAR C
#define INTRA_PRED_LUMA_DC C
#define INTRA_PRED_LUMA_HORZ C
#define INTRA_PRED_LUMA_VER C
#define INTRA_PRED_LUMA_MODE_2 C
#define INTRA_PRED_LUMA_MODE_18_34 C
#define INTRA_PRED_LUMA_MODE_3_T0_9 C
#define INTRA_PRED_LUMA_MODE_11_T0_17 C
#define INTRA_PRED_LUMA_MODE_19_T0_25 C
#define INTRA_PRED_LUMA_MODE_27_T0_33 C
#define INTRA_PRED_CHROMA_PLANAR C
#define INTRA_PRED_CHROMA_DC C
#define INTRA_PRED_CHROMA_HOR C
#define INTRA_PRED_CHROMA_VER C
#define INTRA_PRED_CHROMA_MODE_2 C
#define INTRA_PRED_CHROMA_18_34 C
#define INTRA_PRED_CHROMA_3_T0_9 C
#define INTRA_PRED_CHROMA_11_T0_17 C
#define INTRA_PRED_CHROMA_19_T0_25 C
#define INTRA_PRED_CHROMA_27_T0_33 C
#define INTRA_PRED_CHROMA_REF_SUBSTITUTION C
/* Forward transform functions */
/* Luma */
#define RESI_TRANS_QUANT_4X4_TTYPE1 C
#define RESI_TRANS_QUANT_4X4 C
#define RESI_TRANS_QUANT_8X8 C
#define RESI_TRANS_QUANT_16X16 C
#define RESI_TRANS_QUANT_32X32 C
#define RESI_QUANT_4X4_TTYPE1 C
#define RESI_QUANT_4X4 C
#define RESI_QUANT_8X8 C
#define RESI_QUANT_16X16 C
#define RESI_QUANT_32X32 C
#define RESI_TRANS_4X4_TTYPE1 C
#define RESI_TRANS_4X4 C
#define RESI_TRANS_8X8 C
#define RESI_TRANS_16X16 C
#define RESI_TRANS_32X32 C
#define RESI_4X4_TTYPE1 C
#define RESI_4X4 C
#define RESI_8X8 C
#define RESI_16X16 C
#define RESI_32X32 C
#define TRANS_4X4_TTYPE1 C
#define TRANS_4X4 C
#define TRANS_8X8 C
#define TRANS_16X16 C
#define TRANS_32X32 C
#define QUANT_4X4_TTYPE1 C
#define QUANT_4X4 C
#define QUANT_8X8 C
#define QUANT_16X16 C
#define QUANT_32X32 C
/* Chroma interleaved*/
#define CHROMA_RESI_TRANS_QUANT_4X4 C
#define CHROMA_RESI_TRANS_QUANT_8X8 C
#define CHROMA_RESI_TRANS_QUANT_16X16 C
#define CHROMA_RESI_QUANT_4X4 C
#define CHROMA_RESI_QUANT_8X8 C
#define CHROMA_RESI_QUANT_16X16 C
#define CHROMA_RESI_TRANS_4X4 C
#define CHROMA_RESI_TRANS_8X8 C
#define CHROMA_RESI_TRANS_16X16 C
#define CHROMA_RESI_4X4 C
#define CHROMA_RESI_8X8 C
#define CHROMA_RESI_16X16 C
/* Inverse transform functions */
/* Luma */
#define IQUANT_ITRANS_RECON_4X4_TTYPE1 C
#define IQUANT_ITRANS_RECON_4X4 C
#define IQUANT_ITRANS_RECON_8X8 C
#define IQUANT_ITRANS_RECON_16X16 C
#define IQUANT_ITRANS_RECON_32X32 C
#define IQUANT_RECON_4X4_TTYPE1 C
#define IQUANT_RECON_4X4 C
#define IQUANT_RECON_8X8 C
#define IQUANT_RECON_16X16 C
#define IQUANT_RECON_32X32 C
#define ITRANS_RECON_4X4_TTYPE1 C
#define ITRANS_RECON_4X4 C
#define ITRANS_RECON_8X8 C
#define ITRANS_RECON_16X16 C
#define ITRANS_RECON_32X32 C
#define RECON_4X4_TTYPE1 C
#define RECON_4X4 C
#define RECON_8X8 C
#define RECON_16X16 C
#define RECON_32X32 C
#define ITRANS_4X4_TTYPE1 C
#define ITRANS_4X4 C
#define ITRANS_8X8 C
#define ITRANS_16X16 C
#define ITRANS_32X32 C
/* Chroma interleaved */
#define CHROMA_IQUANT_ITRANS_RECON_4X4 C
#define CHROMA_IQUANT_ITRANS_RECON_8X8 C
#define CHROMA_IQUANT_ITRANS_RECON_16X16 C
#define CHROMA_IQUANT_RECON_4X4 C
#define CHROMA_IQUANT_RECON_8X8 C
#define CHROMA_IQUANT_RECON_16X16 C
#define CHROMA_ITRANS_RECON_4X4 C
#define CHROMA_ITRANS_RECON_8X8 C
#define CHROMA_ITRANS_RECON_16X16 C
#define CHROMA_RECON_4X4 C
#define CHROMA_RECON_8X8 C
#define CHROMA_RECON_16X16 C
#define IHEVC_MEMCPY C
#define IHEVC_MEMSET C
#define IHEVC_MEMSET_16BIT C
#define IHEVC_MEMCPY_MUL_8 C
#define IHEVC_MEMSET_MUL_8 C
#define IHEVC_MEMSET_16BIT_MUL_8 C
#endif /* __IHEVC_FUNC_SELECTOR_H__ */
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@* ihevc_inter_pred_chroma_copy_neon.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* yogeswaran rs
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@* chroma interprediction filter for copy
@*
@* @par description:
@* copies the array of width 'wd' and height 'ht' from the location pointed
@* to by 'pu1_src' to the location pointed to by 'pu1_dst'
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pu1_dst
@* uword8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,
@ uword8 *pu1_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd)
@**************variables vs registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => *pi1_coeff
@ r5 => ht
@ r6 => wd
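@ Reference C equivalent of the copy performed below (an illustrative
@ sketch; note wd is doubled on entry because chroma is interleaved Cb/Cr):
@   for(row = 0; row < ht; row++)
@   {
@       memcpy(pu1_dst, pu1_src, 2 * wd);
@       pu1_src += src_strd;
@       pu1_dst += dst_strd;
@   }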
.text
.align 4
.globl ihevc_inter_pred_chroma_copy_a9q
.type ihevc_inter_pred_chroma_copy_a9q, %function
ihevc_inter_pred_chroma_copy_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
ldr r12,[sp,#48] @loads wd
lsl r12,r12,#1
ldr r7,[sp,#44] @loads ht
cmp r7,#0 @checks ht == 0
ble end_loops
and r8,r7,#3 @check ht for mul of 2
sub r7,r7,r8 @check the rounded height value
tst r12,#15 @checks wd for multiples for 4 & 8
beq core_loop_wd_16
tst r12,#7 @checks wd for multiples for 4 & 8
beq core_loop_wd_8
sub r11,r12,#4
cmp r7,#0
beq outer_loop_wd_4_ht_2
outer_loop_wd_4:
subs r4,r12,#0 @checks wd == 0
ble end_inner_loop_wd_4
inner_loop_wd_4:
vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
add r5,r0,r2 @pu1_src_tmp += src_strd
add r6,r1,r3 @pu1_dst_tmp += dst_strd
vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
add r0,r0,#4 @pu1_src += 4
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
subs r4,r4,#4 @(wd -4)
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
add r1,r1,#4 @pu1_dst += 4
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
bgt inner_loop_wd_4
end_inner_loop_wd_4:
subs r7,r7,#4 @ht - 4
sub r0,r5,r11 @pu1_src = pu1_src_tmp
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_4
cmp r8,#0
bgt outer_loop_wd_4_ht_2
end_loops:
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
outer_loop_wd_4_ht_2:
subs r4,r12,#0 @checks wd == 0
ble end_loops
inner_loop_wd_4_ht_2:
vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
add r5,r0,r2 @pu1_src_tmp += src_strd
add r6,r1,r3 @pu1_dst_tmp += dst_strd
vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
add r0,r0,#4 @pu1_src += 4
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
subs r4,r4,#4 @(wd -4)
add r1,r1,#4 @pu1_dst += 4
bgt inner_loop_wd_4_ht_2
b end_loops
core_loop_wd_8:
sub r11,r12,#8
cmp r7,#0
beq outer_loop_wd_8_ht_2
outer_loop_wd_8:
subs r4,r12,#0 @checks wd
ble end_inner_loop_wd_8
inner_loop_wd_8:
add r5,r0,r2 @pu1_src_tmp += src_strd
vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)
add r6,r1,r3 @pu1_dst_tmp += dst_strd
vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
subs r4,r4,#8 @wd - 8(loop condition)
vld1.8 {d2},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {d2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {d3},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {d3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
bgt inner_loop_wd_8
end_inner_loop_wd_8:
subs r7,r7,#4 @ht -= 4
sub r0,r5,r11 @pu1_src = pu1_src_tmp
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_8
cmp r8,#0
bgt outer_loop_wd_8_ht_2
b end_loops
outer_loop_wd_8_ht_2:
subs r4,r12,#0 @checks wd
ble end_loops
inner_loop_wd_8_ht_2:
add r5,r0,r2 @pu1_src_tmp += src_strd
vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)
add r6,r1,r3 @pu1_dst_tmp += dst_strd
vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
@subs r4,r4,#8 @wd - 8(loop condition)
@bgt inner_loop_wd_8_ht_2
b end_loops
core_loop_wd_16:
sub r11,r12,#16
cmp r7,#0
beq outer_loop_wd_16_ht_2
outer_loop_wd_16:
subs r4,r12,#0 @checks wd
ble end_inner_loop_wd_16
inner_loop_wd_16:
add r5,r0,r2 @pu1_src_tmp += src_strd
vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)
add r6,r1,r3 @pu1_dst_tmp += dst_strd
vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
subs r4,r4,#16 @wd - 16(loop condition)
vld1.8 {q2},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {q2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {q3},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {q3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
bgt inner_loop_wd_16
end_inner_loop_wd_16:
subs r7,r7,#4 @ht -= 4
sub r0,r5,r11 @pu1_src = pu1_src_tmp
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_16
cmp r8,#0
bgt outer_loop_wd_16_ht_2
b end_loops
outer_loop_wd_16_ht_2:
subs r4,r12,#0 @checks wd
ble end_loops
inner_loop_wd_16_ht_2:
add r5,r0,r2 @pu1_src_tmp += src_strd
vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)
add r6,r1,r3 @pu1_dst_tmp += dst_strd
vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
@subs r4,r4,#16 @wd - 16(loop condition)
@bgt inner_loop_wd_16_ht_2
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@* ihevc_inter_pred_chroma_copy_w16out_neon.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* yogeswaran rs
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@* chroma interprediction filter for copy with 16-bit output
@*
@* @par description:
@* copies the array of width 'wd' and height 'ht' from the location pointed
@* to by 'pu1_src' to the location pointed to by 'pi2_dst'; each sample is
@* upshifted by 6 and stored as 16 bits
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pi2_dst
@* word16 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
@ word16 *pi2_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 => src_strd
@r3 => dst_strd
@r4 => *pi1_coeff
@r5 => ht
@r6 => wd
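@ Reference C equivalent of the conversion performed below (an illustrative
@ sketch; samples are upshifted by 6 into the 16-bit destination):
@   for(row = 0; row < ht; row++)
@   {
@       for(col = 0; col < 2 * wd; col++)
@           pi2_dst[col] = (WORD16)pu1_src[col] << 6;
@       pu1_src += src_strd;
@       pi2_dst += dst_strd;
@   }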
.text
.align 4
.globl ihevc_inter_pred_chroma_copy_w16out_a9q
.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function
ihevc_inter_pred_chroma_copy_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
ldr r12,[sp,#48] @loads wd
lsl r12,r12,#1 @2*wd
ldr r7,[sp,#44] @loads ht
cmp r7,#0 @ht condition(ht == 0)
ble end_loops @loop
and r8,r7,#3 @check ht for mul of 2
sub r9,r7,r8 @check the rounded height value
and r11,r7,#6
cmp r11,#6
beq loop_ht_6
tst r12,#7 @conditional check for wd (multiples)
beq core_loop_wd_8
loop_ht_6:
sub r11,r12,#4
lsls r6,r3,#1
cmp r9,#0
beq outer_loop_wd_4_ht_2
outer_loop_wd_4:
subs r4,r12,#0 @wd conditional subtract
ble end_inner_loop_wd_4
inner_loop_wd_4:
vld1.8 {d0},[r0] @vld1_u8(pu1_src_tmp)
add r5,r0,r2 @pu1_src +src_strd
vmovl.u8 q0,d0 @vmovl_u8(vld1_u8(pu1_src_tmp)
add r10,r1,r6
subs r4,r4,#4 @wd - 4
vshl.i64 q0,q0,#6 @vshlq_n_s64(temp, 6)
vld1.8 {d22},[r5],r2 @vld1_u8(pu1_src_tmp)
add r0,r0,#4 @pu1_src += 4
vst1.64 {d0},[r1] @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
add r1,r1,#8
vmovl.u8 q11,d22 @vmovl_u8(vld1_u8(pu1_src_tmp)
vld1.8 {d24},[r5],r2 @vld1_u8(pu1_src_tmp)
vshl.i64 q11,q11,#6 @vshlq_n_s64(temp, 6)
vmovl.u8 q12,d24 @vmovl_u8(vld1_u8(pu1_src_tmp)
vst1.64 {d22},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
vshl.i64 q12,q12,#6 @vshlq_n_s64(temp, 6)
vld1.8 {d26},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.64 {d24},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
vmovl.u8 q13,d26 @vmovl_u8(vld1_u8(pu1_src_tmp)
vshl.i64 q13,q13,#6 @vshlq_n_s64(temp, 6)
vst1.64 {d26},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
bgt inner_loop_wd_4
end_inner_loop_wd_4:
subs r9,r9,#4 @ht - 4
sub r0,r5,r11
sub r1,r10,r11,lsl #1
bgt outer_loop_wd_4
cmp r8,#0
bgt outer_loop_wd_4_ht_2
end_loops:
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
outer_loop_wd_4_ht_2:
subs r4,r12,#0 @wd conditional subtract
ble end_inner_loop_wd_4
inner_loop_wd_4_ht_2:
vld1.8 {d0},[r0] @vld1_u8(pu1_src_tmp)
add r5,r0,r2 @pu1_src +src_strd
vmovl.u8 q0,d0 @vmovl_u8(vld1_u8(pu1_src_tmp)
add r10,r1,r6
subs r4,r4,#4 @wd - 4
vshl.i64 q0,q0,#6 @vshlq_n_s64(temp, 6)
vld1.8 {d22},[r5],r2 @vld1_u8(pu1_src_tmp)
add r0,r0,#4 @pu1_src += 4
vst1.64 {d0},[r1] @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
add r1,r1,#8
vmovl.u8 q11,d22 @vmovl_u8(vld1_u8(pu1_src_tmp)
vld1.8 {d24},[r5],r2 @vld1_u8(pu1_src_tmp)
vshl.i64 q11,q11,#6 @vshlq_n_s64(temp, 6)
vmovl.u8 q12,d24 @vmovl_u8(vld1_u8(pu1_src_tmp)
vst1.64 {d22},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
bgt inner_loop_wd_4_ht_2
b end_loops
core_loop_wd_8:
@sub r11,r12,#8
lsls r5,r3,#1
rsb r11,r12,r3, lsl #2 @ r11 = (dst_strd * 4) - width
rsb r8,r12,r2,lsl #2 @r2->src_strd
mov r4,r12, lsr #3 @ divide by 8
mov r7,r9
mul r7, r4
sub r4,r12,#0 @wd conditional check
sub r7,r7,#4 @subtract one for epilog
cmp r9,#0
beq core_loop_wd_8_ht_2
prolog:
add r6,r0,r2 @pu1_src_tmp += src_strd
add r10,r1,r5
vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp)
vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp)
subs r4,r4,#8 @wd decrements by 8
vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
addle r0,r0,r8
add r6,r0,r2 @pu1_src_tmp += src_strd
vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
addle r1,r1,r11,lsl #1
suble r4,r12,#0 @wd conditional check
subs r7,r7,#4 @ht - 4
blt epilog_end @jumps to epilog_end
beq epilog @jumps to epilog
outer_loop_wd_8:
vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp)
vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp)
subs r4,r4,#8 @wd decrements by 8
addle r0,r0,r8
add r6,r0,r2 @pu1_src_tmp += src_strd
vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
add r10,r1,r5
vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
addle r1,r1,r11,lsl #1
suble r4,r12,#0 @wd conditional check
subs r7,r7,#4 @ht - 4
bgt outer_loop_wd_8
epilog:
vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp)
vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp)
@add r6,r0,r2 @pu1_src_tmp += src_strd
vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
add r10,r1,r5
vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
epilog_end:
vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
b end_loops
core_loop_wd_8_ht_2:
add r6,r0,r2 @pu1_src_tmp += src_strd
add r10,r1,r5
vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
subs r12,r12,#8 @wd decrements by 8
vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
bgt core_loop_wd_8_ht_2
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@* ihevc_inter_pred_chroma_vert_neon.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* yogeswaran rs
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@* chroma interprediction filter for vertical input
@*
@* @par description:
@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@* the elements pointed to by 'pu1_src' and writes to the location pointed
@* to by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
@* assumptions : the function is optimized considering the fact that width
@* is a multiple of 2, 4 or 8 and that height is a multiple of 2; widths
@* of 4 and 8 are optimized further
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pu1_dst
@* uword8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
@ uword8 *pu1_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pu1_dst
@r2 => src_strd
@r3 => dst_strd
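@ Reference form of the 4-tap vertical filter computed below (an orientation
@ sketch; the chroma coefficient signs are folded into the choice of
@ vmlsl/vmlal on the absolute values loaded from pi1_coeff):
@   acc = -|c0|*src[-strd] + |c1|*src[0] + |c2|*src[strd] - |c3|*src[2*strd]
@   dst = clip_u8((acc + 32) >> 6)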
.text
.align 4
.globl ihevc_inter_pred_chroma_vert_a9q
.type ihevc_inter_pred_chroma_vert_a9q, %function
ihevc_inter_pred_chroma_vert_a9q:
stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments
ldr r4,[sp,#44] @loads ht
ldr r12,[sp,#40] @loads pi1_coeff
cmp r4,#0 @checks ht == 0
ldr r6,[sp,#48] @loads wd
sub r0,r0,r2 @pu1_src - src_strd
vld1.8 {d0},[r12] @loads pi1_coeff
ble end_loops @jumps to end
tst r6,#3 @checks (wd & 3)
vabs.s8 d3,d0 @vabs_s8(coeff)
lsl r10,r6,#1 @2*wd
vdup.8 d0,d3[0] @coeffabs_0
vdup.8 d1,d3[1] @coeffabs_1
vdup.8 d2,d3[2] @coeffabs_2
vdup.8 d3,d3[3] @coeffabs_3
bgt outer_loop_wd_2 @jumps to loop handling wd ==2
tst r4,#7 @checks ht for mul of 8
beq core_loop_ht_8 @when height is multiple of 8
lsl r7,r3,#1 @2*dst_strd
sub r9,r7,r10 @2*dst_strd - 2wd
lsl r12,r2,#1 @2*src_strd
sub r8,r12,r10 @2*src_strd - 2wd
mov r5,r10 @2wd
inner_loop_ht_2: @called when wd is multiple of 4 and ht is 4,2
add r6,r0,r2 @pu1_src +src_strd
vld1.8 {d9},[r6],r2 @loads pu1_src
subs r5,r5,#8 @2wd - 8
vld1.8 {d5},[r0]! @loads src
vmull.u8 q3,d9,d1 @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
vld1.8 {d4},[r6],r2 @loads incremented src
vmlsl.u8 q3,d5,d0 @vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
vld1.8 {d8},[r6],r2 @loads incremented src
vmlal.u8 q3,d4,d2 @vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
vmull.u8 q2,d4,d1
vmlsl.u8 q3,d8,d3
vmlsl.u8 q2,d9,d0
vld1.8 {d10},[r6] @loads the incremented src
vmlal.u8 q2,d8,d2
vqrshrun.s16 d6,q3,#6 @shifts right
vmlsl.u8 q2,d10,d3
add r6,r1,r3 @pu1_dst + dst_strd
vqrshrun.s16 d4,q2,#6 @shifts right
vst1.8 {d6},[r1]! @stores the loaded value
vst1.8 {d4},[r6] @stores the loaded value
bgt inner_loop_ht_2 @inner loop again
subs r4,r4,#2 @ht - 2
add r1,r1,r9 @pu1_dst += (2*dst_strd - 2wd)
mov r5,r10 @2wd
add r0,r0,r8 @pu1_src += (2*src_strd - 2wd)
bgt inner_loop_ht_2 @loop again
b end_loops @jumps to end
outer_loop_wd_2: @called when width is multiple of 2
lsl r5,r3,#1 @2*dst_strd
mov r12,r10 @2wd
sub r9,r5,r10 @2*dst_strd - 2wd
lsl r7,r2,#1 @2*src_strd
sub r8,r7,r10 @2*src_strd - 2wd
inner_loop_wd_2:
add r6,r0,r2 @pu1_src + src_strd
vld1.32 {d6[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0
subs r12,r12,#4 @2wd - 4
add r0,r0,#4 @pu1_src + 4
vld1.32 {d6[1]},[r6],r2 @loads pu1_src_tmp
vdup.32 d7,d6[1]
vld1.32 {d7[1]},[r6],r2 @loads pu1_src_tmp
vmull.u8 q2,d7,d1 @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
vdup.32 d7,d7[1]
vld1.32 {d7[1]},[r6],r2
vmlsl.u8 q2,d6,d0
vmlal.u8 q2,d7,d2
vdup.32 d7,d7[1]
vld1.32 {d7[1]},[r6]
add r6,r1,r3 @pu1_dst + dst_strd
vmlsl.u8 q2,d7,d3
vqrshrun.s16 d4,q2,#6 @vrshrq_n_s16(vreinterpretq_s16_u16(mul_res1),6)
vst1.32 {d4[0]},[r1] @stores the loaded value
add r1,r1,#4 @pu1_dst += 4
vst1.32 {d4[1]},[r6] @stores the loaded value
bgt inner_loop_wd_2 @inner loop again
@inner loop ends
subs r4,r4,#2 @ht - 2
add r1,r1,r9 @pu1_dst += 2*dst_strd - 2*wd
mov r12,r10 @2wd
add r0,r0,r8 @pu1_src += 2*src_strd - 2*wd
bgt inner_loop_wd_2 @loop again
b end_loops @jumps to end
core_loop_ht_8: @when wd & ht is multiple of 8
lsl r12,r3,#2 @4*dst_strd
sub r8,r12,r10 @4*dst_strd - 2wd
lsl r12,r2,#2 @4*src_strd
sub r9,r12,r10 @4*src_strd - 2wd
bic r5,r10,#7 @r5 ->wd
mov r14,r10,lsr #3 @divide by 8
mul r12,r4,r14 @multiply height by width
sub r12,#4 @subtract by one for epilog
prolog:
add r6,r0,r2 @pu1_src + src_strd
vld1.8 {d5},[r6],r2 @loads pu1_src
subs r5,r5,#8 @2wd - 8
vld1.8 {d4},[r0]! @loads the source
vld1.8 {d6},[r6],r2 @load and increment
vmull.u8 q15,d5,d1 @mul with coeff 1
vld1.8 {d7},[r6],r2 @load and increment
vmlsl.u8 q15,d4,d0
add r7,r1,r3 @pu1_dst
vmlal.u8 q15,d6,d2
vmlsl.u8 q15,d7,d3
vld1.8 {d8},[r6],r2 @load and increment
vmull.u8 q14,d6,d1 @mul_res 2
addle r0,r0,r9 @pu1_dst += 4*dst_strd - 2*wd
vmlsl.u8 q14,d5,d0
bicle r5,r10,#7 @r5 ->wd
vmlal.u8 q14,d7,d2
vld1.8 {d9},[r6],r2
vmlsl.u8 q14,d8,d3
vqrshrun.s16 d30,q15,#6
vld1.8 {d10},[r6],r2
vmull.u8 q13,d7,d1
add r6,r0,r2 @pu1_src + src_strd
vmlsl.u8 q13,d6,d0
vst1.8 {d30},[r1]! @stores the loaded value
vmlal.u8 q13,d8,d2
vld1.8 {d4},[r0]! @loads the source
vmlsl.u8 q13,d9,d3
vqrshrun.s16 d28,q14,#6
addle r1,r1,r8 @pu1_src += 4*src_strd - 2*wd
vmull.u8 q12,d8,d1
vld1.8 {d5},[r6],r2 @loads pu1_src
vmlsl.u8 q12,d7,d0
subs r12,r12,#4
vld1.8 {d6},[r6],r2 @load and increment
vmlal.u8 q12,d9,d2
vld1.8 {d7},[r6],r2 @load and increment
vmlsl.u8 q12,d10,d3
lsl r11,r2,#2
vst1.8 {d28},[r7],r3 @stores the loaded value
vqrshrun.s16 d26,q13,#6
rsb r11,r2,r2,lsl #3
add r14,r2,r2,lsl #1
add r14,r14,r11
ble epilog @jumps to epilog
kernel_8:
vmull.u8 q15,d5,d1 @mul with coeff 1
subs r5,r5,#8 @2wd - 8
vmlsl.u8 q15,d4,d0
addle r0,r0,r9 @pu1_dst += 4*dst_strd - 2*wd
vmlal.u8 q15,d6,d2
rsble r11,r2,r2,lsl #3
vmlsl.u8 q15,d7,d3
vst1.8 {d26},[r7],r3 @stores the loaded value
vqrshrun.s16 d24,q12,#6
vld1.8 {d8},[r6],r2 @load and increment
vmull.u8 q14,d6,d1 @mul_res 2
bicle r5,r10,#7 @r5 ->wd
vmlsl.u8 q14,d5,d0
vst1.8 {d24},[r7],r3 @stores the loaded value
vmlal.u8 q14,d7,d2
vld1.8 {d9},[r6],r2
vqrshrun.s16 d30,q15,#6
vmlsl.u8 q14,d8,d3
vld1.8 {d10},[r6],r2
add r7,r1,r3 @pu1_dst
vmull.u8 q13,d7,d1
add r6,r0,r2 @pu1_src + src_strd
pld [r0,r11]
vmlsl.u8 q13,d6,d0
vld1.8 {d4},[r0]! @loads the source
vmlal.u8 q13,d8,d2
vst1.8 {d30},[r1]! @stores the loaded value
vmlsl.u8 q13,d9,d3
vld1.8 {d5},[r6],r2 @loads pu1_src
add r11,r11,r2
vqrshrun.s16 d28,q14,#6
vmull.u8 q12,d8,d1
vld1.8 {d6},[r6],r2 @load and increment
addle r1,r1,r8 @pu1_dst += 4*dst_strd - 2*wd (next row group)
cmp r11,r14 @prefetch offset past 10*src_strd?
rsbgt r11,r2,r2,lsl #3 @if so, wrap it back to 7*src_strd
vmlsl.u8 q12,d7,d0
subs r12,r12,#4
vmlal.u8 q12,d9,d2
vld1.8 {d7},[r6],r2 @load and increment
vmlsl.u8 q12,d10,d3
vst1.8 {d28},[r7],r3 @stores the loaded value
vqrshrun.s16 d26,q13,#6
bgt kernel_8 @jumps to kernel_8
epilog:
vmull.u8 q15,d5,d1 @mul with coeff 1
vmlsl.u8 q15,d4,d0
vmlal.u8 q15,d6,d2
vmlsl.u8 q15,d7,d3
vst1.8 {d26},[r7],r3 @stores the loaded value
vqrshrun.s16 d24,q12,#6
vld1.8 {d8},[r6],r2 @load and increment
vmull.u8 q14,d6,d1 @mul_res 2
vmlsl.u8 q14,d5,d0
vmlal.u8 q14,d7,d2
vmlsl.u8 q14,d8,d3
vst1.8 {d24},[r7],r3 @stores the loaded value
vqrshrun.s16 d30,q15,#6
vld1.8 {d9},[r6],r2
vmull.u8 q13,d7,d1
add r7,r1,r3 @pu1_dst + dst_strd
vmlsl.u8 q13,d6,d0
vst1.8 {d30},[r1]! @stores the loaded value
vqrshrun.s16 d28,q14,#6
vmlal.u8 q13,d8,d2
vld1.8 {d10},[r6],r2
vmlsl.u8 q13,d9,d3
vmull.u8 q12,d8,d1
vqrshrun.s16 d26,q13,#6
vst1.8 {d28},[r7],r3 @stores the loaded value
vmlsl.u8 q12,d7,d0
vmlal.u8 q12,d9,d2
vst1.8 {d26},[r7],r3 @stores the loaded value
vmlsl.u8 q12,d10,d3
vqrshrun.s16 d24,q12,#6
vst1.8 {d24},[r7],r3 @stores the loaded value
end_loops:
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@* ihevc_inter_pred_chroma_vert_w16inp.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@* yogeswaran rs / parthiban
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@* chroma interprediction filter for 16bit vertical input.
@*
@* @par description:
@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@* the elements pointed by 'pi2_src' and writes to the location pointed by
@* 'pu1_dst'. input is 16 bits; the filter output is downshifted by 12 and
@* clipped to lie between 0 and 255. assumption: the function is
@* optimized assuming width and height are multiples of 2.
@*
@* @param[in] pi2_src
@* word16 pointer to the source
@*
@* @param[out] pu1_dst
@* uword8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
@ uword8 *pu1_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd)
@**************variables vs registers*****************************************
@r0 => *pi2_src
@r1 => *pu1_dst
@r2 => src_strd
@r3 => dst_strd
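@
@ for reference, a minimal scalar c sketch of the computation described
@ above (an illustrative reconstruction, not code from this commit;
@ chroma_vert_w16inp_ref and CLIP_U8 are assumed names, and the neon
@ code's intermediate 16-bit saturation is omitted):
@
@ /* 4-tap vertical chroma filter: 16-bit input, 8-bit output */
@ void chroma_vert_w16inp_ref(word16 *pi2_src, uword8 *pu1_dst,
@                             word32 src_strd, word32 dst_strd,
@                             word8 *pi1_coeff, word32 ht, word32 wd)
@ {
@     for(word32 row = 0; row < ht; row++)
@         for(word32 col = 0; col < 2 * wd; col++) /* cb/cr interleaved */
@         {
@             word32 sum = 0;
@             for(word32 i = 0; i < 4; i++) /* taps span rows -1..+2 */
@                 sum += pi1_coeff[i] *
@                        pi2_src[(row + i - 1) * src_strd + col];
@             /* >>6 narrowing, then rounding >>6 and clip to [0,255] */
@             pu1_dst[row * dst_strd + col] = CLIP_U8(((sum >> 6) + 32) >> 6);
@         }
@ }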
.text
.align 4
.globl ihevc_inter_pred_chroma_vert_w16inp_a9q
.type ihevc_inter_pred_chroma_vert_w16inp_a9q, %function
ihevc_inter_pred_chroma_vert_w16inp_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
ldr r4, [sp,#40] @loads pi1_coeff
ldr r6, [sp,#48] @wd
lsl r2,r2,#1 @src_strd *= 2 (byte stride for 16-bit input)
ldr r5,[sp,#44] @loads ht
vld1.8 {d0},[r4] @loads pi1_coeff
sub r4,r0,r2 @pu1_src - src_strd
vmovl.s8 q0,d0 @widen coefficients to 16 bits
tst r6,#3 @checks whether wd is a multiple of 4 (wd == 2 case)
vdup.16 d12,d0[0] @coeff_0
vdup.16 d13,d0[1] @coeff_1
vdup.16 d14,d0[2] @coeff_2
vdup.16 d15,d0[3] @coeff_3
bgt core_loop_ht_2 @jumps to the loop that handles wd == 2
tst r5,#3 @checks whether ht is a multiple of 4
beq core_loop_ht_4 @jumps to the loop that handles ht multiple of 4
core_loop_ht_2:
lsl r7,r2,#1 @2*src_strd
lsl r12,r3,#1 @2*dst_strd
lsl r9,r6,#2 @4*wd
sub r6,r12,r6,lsl #1 @2*dst_strd - 2*wd
sub r8,r7,r9 @2*src_strd - 4*wd
mov r12,r9 @4wd
inner_loop_ht_2:
add r0,r4,r2 @increments pi2_src
vld1.16 {d0},[r4]! @loads pu1_src
vmull.s16 q0,d0,d12 @vmull_s16(src_tmp1, coeff_0)
subs r12,r12,#8 @4wd - 8
vld1.16 {d2},[r0],r2 @loads pi2_src
vmull.s16 q4,d2,d12 @vmull_s16(src_tmp2, coeff_0)
vld1.16 {d3},[r0],r2 @loads pi2_src
vmlal.s16 q0,d2,d13
vld1.16 {d6},[r0],r2
vmlal.s16 q4,d3,d13
vld1.16 {d2},[r0]
add r7,r1,r3 @pu1_dst + dst_strd
vmlal.s16 q0,d3,d14
vmlal.s16 q4,d6,d14
vmlal.s16 q0,d6,d15
vmlal.s16 q4,d2,d15
vqshrn.s32 d0,q0,#6 @right shift
vqshrn.s32 d30,q4,#6 @right shift
vqrshrun.s16 d0,q0,#6 @rounding shift
vqrshrun.s16 d30,q15,#6 @rounding shift
vst1.32 {d0[0]},[r1]! @stores the loaded value
vst1.32 {d30[0]},[r7] @stores the loaded value
bgt inner_loop_ht_2 @inner loop -again
@inner loop ends
subs r5,r5,#2 @ht -= 2
add r1,r1,r6 @pu1_dst += 2*dst_strd - 2*wd
mov r12,r9 @4wd
add r4,r4,r8 @pi2_src += 2*src_strd - 4*wd
bgt inner_loop_ht_2 @loop again
b end_loops @jumps to end
core_loop_ht_4:
lsl r7,r2,#2 @4*src_strd
lsl r12,r3,#2 @4*dst_strd
mov r11,r6,lsr #1 @wd/2
sub lr,r12,r6,lsl #1 @4*dst_strd - 2*wd
sub r8,r7,r6,lsl #2 @4*src_strd - 4*wd
mul r12,r5,r11 @ht * wd/2 = total loop count
sub r12,#4 @reserve one pipelined iteration for the epilog
mov r11,r6,lsl #1 @2*wd
prolog:
add r0,r4,r2 @increments pi2_src
vld1.16 {d0},[r4]! @loads pu1_src
vld1.16 {d1},[r0],r2 @loads pi2_src
subs r11,r11,#4
vld1.16 {d2},[r0],r2 @loads pi2_src
vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
vld1.16 {d3},[r0],r2
vmlal.s16 q15,d1,d13
vmlal.s16 q15,d2,d14
add r9,r1,r3 @pu1_dst + dst_strd
vmlal.s16 q15,d3,d15
vld1.16 {d4},[r0],r2
vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
addle r4,r4,r8 @advance pi2_src to the next 4-row group
vmlal.s16 q14,d2,d13
vld1.s16 {d5},[r0],r2
vmlal.s16 q14,d3,d14
vld1.s16 {d6},[r0],r2
vmlal.s16 q14,d4,d15
movle r11,r6,lsl #1 @reset column counter to 2*wd
vqshrn.s32 d30,q15,#6 @right shift
vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
add r0,r4,r2
vmlal.s16 q13,d3,d13
vmlal.s16 q13,d4,d14
vld1.16 {d0},[r4]! @loads pu1_src
vmlal.s16 q13,d5,d15
vqrshrun.s16 d30,q15,#6 @rounding shift
vqshrn.s32 d28,q14,#6 @right shift
vld1.16 {d1},[r0],r2 @loads pi2_src
vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
vst1.32 {d30[0]},[r1]! @stores the loaded value
vmlal.s16 q12,d4,d13
vld1.16 {d2},[r0],r2 @loads pi2_src
vmlal.s16 q12,d5,d14
vld1.16 {d3},[r0],r2
vmlal.s16 q12,d6,d15
addle r1,r1,lr @advance pu1_dst to the next 4-row group
vqshrn.s32 d26,q13,#6 @right shift
subs r12,r12,#4
vqrshrun.s16 d28,q14,#6 @rounding shift
beq epilog @jumps to epilog
kernel_4:
vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
subs r11,r11,#4
vmlal.s16 q15,d1,d13
vst1.32 {d28[0]},[r9],r3 @stores the loaded value
vmlal.s16 q15,d2,d14
vmlal.s16 q15,d3,d15
vqshrn.s32 d24,q12,#6 @right shift
vqrshrun.s16 d26,q13,#6 @rounding shift
vld1.16 {d4},[r0],r2
vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
vmlal.s16 q14,d2,d13
vmlal.s16 q14,d3,d14
vmlal.s16 q14,d4,d15
vst1.32 {d26[0]},[r9],r3 @stores the loaded value
addle r4,r4,r8 @advance pi2_src to the next 4-row group
movle r11,r6,lsl #1 @reset column counter to 2*wd
vqshrn.s32 d30,q15,#6 @right shift
vqrshrun.s16 d24,q12,#6 @rounding shift
vld1.s16 {d5},[r0],r2
vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
vld1.s16 {d6},[r0],r2
vmlal.s16 q13,d3,d13
vst1.32 {d24[0]},[r9] @stores the loaded value
add r0,r4,r2
vmlal.s16 q13,d4,d14
vld1.16 {d0},[r4]! @loads pu1_src
vmlal.s16 q13,d5,d15
vqshrn.s32 d28,q14,#6 @right shift
vqrshrun.s16 d30,q15,#6 @rounding shift
vld1.16 {d1},[r0],r2 @loads pi2_src
vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
add r9,r1,r3 @pu1_dst + dst_strd
vld1.16 {d2},[r0],r2 @loads pi2_src
vmlal.s16 q12,d4,d13
vld1.16 {d3},[r0],r2
vmlal.s16 q12,d5,d14
vst1.32 {d30[0]},[r1]! @stores the loaded value
vmlal.s16 q12,d6,d15
vqshrn.s32 d26,q13,#6 @right shift
vqrshrun.s16 d28,q14,#6 @rounding shift
addle r1,r1,lr @advance pu1_dst to the next 4-row group
subs r12,r12,#4
bgt kernel_4 @jumps to kernel_4
epilog:
vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
vst1.32 {d28[0]},[r9],r3 @stores the loaded value
vmlal.s16 q15,d1,d13
vmlal.s16 q15,d2,d14
vmlal.s16 q15,d3,d15
vqshrn.s32 d24,q12,#6 @right shift
vqrshrun.s16 d26,q13,#6 @rounding shift
vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
vld1.16 {d4},[r0],r2
vmlal.s16 q14,d2,d13
vst1.32 {d26[0]},[r9],r3 @stores the loaded value
vmlal.s16 q14,d3,d14
vmlal.s16 q14,d4,d15
vqshrn.s32 d30,q15,#6 @right shift
vqrshrun.s16 d24,q12,#6 @rounding shift
vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
vld1.s16 {d5},[r0],r2
vmlal.s16 q13,d3,d13
vmlal.s16 q13,d4,d14
vmlal.s16 q13,d5,d15
vqshrn.s32 d28,q14,#6 @right shift
vqrshrun.s16 d30,q15,#6 @rounding shift
vst1.32 {d24[0]},[r9] @stores the loaded value
vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
vmlal.s16 q12,d4,d13
add r9,r1,r3 @pu1_dst + dst_strd
vld1.s16 {d6},[r0],r2
vmlal.s16 q12,d5,d14
vmlal.s16 q12,d6,d15
vst1.32 {d30[0]},[r1]! @stores the loaded value
vqrshrun.s16 d28,q14,#6 @rounding shift
vqshrn.s32 d26,q13,#6 @right shift
vst1.32 {d28[0]},[r9],r3 @stores the loaded value
vqrshrun.s16 d26,q13,#6 @rounding shift
vqshrn.s32 d24,q12,#6 @right shift
vst1.32 {d26[0]},[r9],r3 @stores the loaded value
vqrshrun.s16 d24,q12,#6 @rounding shift
vst1.32 {d24[0]},[r9] @stores the loaded value
end_loops:
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@* ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@* yogeswaran rs / parthiban
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@* chroma interprediction filter for 16bit vertical input and output.
@*
@* @par description:
@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@* the elements pointed by 'pi2_src' and writes to the location pointed by
@* 'pi2_dst'. input is 16 bits; the filter output is downshifted by 6 and
@* 8192 is subtracted to store it as a 16 bit number. the output is used as
@* an input to weighted prediction. assumption: the function is optimized
@* assuming width and height are multiples of 2.
@*
@* @param[in] pi2_src
@* word16 pointer to the source
@*
@* @param[out] pi2_dst
@* word16 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
@ word16 *pi2_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd)
@**************variables vs registers*****************************************
@r0 => *pi2_src
@r1 => *pi2_dst
@r2 => src_strd
@r3 => dst_strd
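@
@ a minimal scalar c sketch of the arithmetic visible in the code below
@ (illustrative only; chroma_vert_w16inp_w16out_ref and CLIP_S16 are
@ assumed names, and the documented subtraction of 8192 is taken to be
@ folded into the 16-bit input produced by the horizontal stage):
@
@ /* 4-tap vertical chroma filter: 16-bit input, 16-bit output */
@ void chroma_vert_w16inp_w16out_ref(word16 *pi2_src, word16 *pi2_dst,
@                                    word32 src_strd, word32 dst_strd,
@                                    word8 *pi1_coeff, word32 ht, word32 wd)
@ {
@     for(word32 row = 0; row < ht; row++)
@         for(word32 col = 0; col < 2 * wd; col++) /* cb/cr interleaved */
@         {
@             word32 sum = 0;
@             for(word32 i = 0; i < 4; i++) /* taps span rows -1..+2 */
@                 sum += pi1_coeff[i] *
@                        pi2_src[(row + i - 1) * src_strd + col];
@             /* saturating narrow with a plain >>6, no rounding */
@             pi2_dst[row * dst_strd + col] = CLIP_S16(sum >> 6);
@         }
@ }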
.text
.align 4
.globl ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q
.type ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q, %function
ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
ldr r4, [sp,#40] @loads pi1_coeff
ldr r6, [sp,#48] @wd
lsl r2,r2,#1 @src_strd *= 2 (byte stride for 16-bit input)
ldr r5,[sp,#44] @loads ht
vld1.8 {d0},[r4] @loads pi1_coeff
sub r4,r0,r2 @pu1_src - src_strd
vmovl.s8 q0,d0 @widen coefficients to 16 bits
tst r6,#3 @checks whether wd is a multiple of 4 (wd == 2 case)
vdup.16 d12,d0[0] @coeff_0
vdup.16 d13,d0[1] @coeff_1
vdup.16 d14,d0[2] @coeff_2
vdup.16 d15,d0[3] @coeff_3
bgt core_loop_ht_2 @jumps to the loop that handles wd == 2
tst r5,#3 @checks whether ht is a multiple of 4
beq core_loop_ht_4 @jumps to the loop that handles ht multiple of 4
core_loop_ht_2:
lsl r7,r2,#1 @2*src_strd
lsl r3,r3,#1 @2*dst_strd
lsl r9,r6,#2 @4*wd
sub r6,r3,r6,lsl #1 @2*dst_strd - 2*wd
sub r8,r7,r9 @2*src_strd - 4*wd
mov r12,r9 @4wd
inner_loop_ht_2:
add r0,r4,r2 @increments pi2_src
vld1.16 {d0},[r4]! @loads pu1_src
vmull.s16 q0,d0,d12 @vmull_s16(src_tmp1, coeff_0)
subs r12,r12,#8 @4wd - 8
vld1.16 {d2},[r0],r2 @loads pi2_src
vmull.s16 q4,d2,d12 @vmull_s16(src_tmp2, coeff_0)
vld1.16 {d3},[r0],r2 @loads pi2_src
vmlal.s16 q0,d2,d13
vld1.16 {d6},[r0],r2
vmlal.s16 q4,d3,d13
vld1.16 {d2},[r0]
add r7,r1,r3 @pu1_dst + dst_strd
vmlal.s16 q0,d3,d14
vmlal.s16 q4,d6,d14
vmlal.s16 q0,d6,d15
vmlal.s16 q4,d2,d15
vqshrn.s32 d0,q0,#6 @right shift
vqshrn.s32 d30,q4,#6 @right shift
vst1.32 {d0},[r1]! @stores the loaded value
vst1.32 {d30},[r7] @stores the loaded value
bgt inner_loop_ht_2 @inner loop -again
@inner loop ends
subs r5,r5,#2 @ht -= 2
add r1,r1,r6,lsl #1 @pu1_dst += 2*dst_strd - 2*wd
mov r12,r9 @4wd
add r4,r4,r8 @pi2_src += 2*src_strd - 4*wd
bgt inner_loop_ht_2 @loop again
b end_loops @jumps to end
core_loop_ht_4:
lsl r7,r2,#2 @4*src_strd
lsl r10,r3,#2 @4*dst_strd
mov r11,r6,lsr #1 @wd/2
sub lr,r10,r6,lsl #1 @4*dst_strd - 2*wd
sub r8,r7,r6,lsl #2 @4*src_strd - 4*wd
mul r12,r5,r11 @ht * wd/2 = total loop count
sub r12,#4 @reserve one pipelined iteration for the epilog
mov r11,r6,lsl #1 @2*wd
lsl r3,r3,#1 @2*dst_strd
prolog:
add r0,r4,r2 @increments pi2_src
vld1.16 {d0},[r4]! @loads pu1_src
vld1.16 {d1},[r0],r2 @loads pi2_src
subs r11,r11,#4
vld1.16 {d2},[r0],r2 @loads pi2_src
vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
vld1.16 {d3},[r0],r2
vmlal.s16 q15,d1,d13
vmlal.s16 q15,d2,d14
add r9,r1,r3 @pu1_dst + dst_strd
vmlal.s16 q15,d3,d15
vld1.16 {d4},[r0],r2
vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
addle r4,r4,r8 @advance pi2_src to the next 4-row group
movle r11,r6,lsl #1 @reset column counter to 2*wd
vmlal.s16 q14,d2,d13
vmlal.s16 q14,d3,d14
vld1.s16 {d5},[r0],r2
vmlal.s16 q14,d4,d15
vqshrn.s32 d30,q15,#6 @right shift
vld1.s16 {d6},[r0],r2
vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
vmlal.s16 q13,d3,d13
vmlal.s16 q13,d4,d14
add r0,r4,r2
vld1.16 {d0},[r4]! @loads pu1_src
vmlal.s16 q13,d5,d15
vqshrn.s32 d28,q14,#6 @right shift
vld1.16 {d1},[r0],r2 @loads pi2_src
vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
vst1.32 {d30},[r1]! @stores the loaded value
vmlal.s16 q12,d4,d13
vld1.16 {d2},[r0],r2 @loads pi2_src
vmlal.s16 q12,d5,d14
vld1.16 {d3},[r0],r2
vmlal.s16 q12,d6,d15
addle r1,r1,lr,lsl #1 @advance pi2_dst to the next 4-row group
vqshrn.s32 d26,q13,#6 @right shift
subs r12,r12,#4
beq epilog @jumps to epilog
kernel_4:
vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
subs r11,r11,#4
vmlal.s16 q15,d1,d13
vst1.32 {d28},[r9],r3 @stores the loaded value
vmlal.s16 q15,d2,d14
vmlal.s16 q15,d3,d15
vqshrn.s32 d24,q12,#6 @right shift
vld1.16 {d4},[r0],r2
vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
vmlal.s16 q14,d2,d13
vmlal.s16 q14,d3,d14
vmlal.s16 q14,d4,d15
vst1.32 {d26},[r9],r3 @stores the loaded value
addle r4,r4,r8 @advance pi2_src to the next 4-row group
movle r11,r6,lsl #1 @reset column counter to 2*wd
vqshrn.s32 d30,q15,#6 @right shift
vld1.s16 {d5},[r0],r2
vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
vld1.s16 {d6},[r0],r2
vmlal.s16 q13,d3,d13
vst1.32 {d24},[r9] @stores the loaded value
add r0,r4,r2
vmlal.s16 q13,d4,d14
vld1.16 {d0},[r4]! @loads pu1_src
vmlal.s16 q13,d5,d15
vqshrn.s32 d28,q14,#6 @right shift
vld1.16 {d1},[r0],r2 @loads pi2_src
vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
vld1.16 {d2},[r0],r2 @loads pi2_src
vmlal.s16 q12,d4,d13
add r9,r1,r3 @pu1_dst + dst_strd
vld1.16 {d3},[r0],r2
vmlal.s16 q12,d5,d14
vst1.32 {d30},[r1]! @stores the loaded value
vmlal.s16 q12,d6,d15
vqshrn.s32 d26,q13,#6 @right shift
addle r1,r1,lr,lsl #1 @advance pi2_dst to the next 4-row group
subs r12,r12,#4
bgt kernel_4 @jumps to kernel_4
epilog:
vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
vst1.32 {d28},[r9],r3 @stores the loaded value
vmlal.s16 q15,d1,d13
vmlal.s16 q15,d2,d14
vmlal.s16 q15,d3,d15
vqshrn.s32 d24,q12,#6 @right shift
vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
vld1.16 {d4},[r0],r2
vmlal.s16 q14,d2,d13
vst1.32 {d26},[r9],r3 @stores the loaded value
vmlal.s16 q14,d3,d14
vmlal.s16 q14,d4,d15
vqshrn.s32 d30,q15,#6 @right shift
vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
vld1.s16 {d5},[r0],r2
vmlal.s16 q13,d3,d13
vmlal.s16 q13,d4,d14
vmlal.s16 q13,d5,d15
vqshrn.s32 d28,q14,#6 @right shift
vst1.32 {d24},[r9] @stores the loaded value
vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
vmlal.s16 q12,d4,d13
add r9,r1,r3 @pu1_dst + dst_strd
vld1.s16 {d6},[r0],r2
vmlal.s16 q12,d5,d14
vmlal.s16 q12,d6,d15
vst1.32 {d30},[r1]! @stores the loaded value
vqshrn.s32 d26,q13,#6 @right shift
vst1.32 {d28},[r9],r3 @stores the loaded value
vqshrn.s32 d24,q12,#6 @right shift
vst1.32 {d26},[r9],r3 @stores the loaded value
vst1.32 {d24},[r9] @stores the loaded value
end_loops:
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@*
@* @brief
@* interprediction luma function for copy
@*
@* @par description:
@* copies the array of width 'wd' and height 'ht' from the location pointed
@* by 'pu1_src' to the location pointed by 'pu1_dst'
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pu1_dst
@* uword8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_luma_copy (
@ uword8 *pu1_src,
@ uword8 *pu1_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd )
@**************variables vs registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r7 => ht
@ r12 => wd
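@
@ for reference, a minimal scalar c sketch of the copy (illustrative
@ only; luma_copy_ref is an assumed name, and pi1_coeff is unused here,
@ matching the shared interpolation signature):
@
@ void luma_copy_ref(uword8 *pu1_src, uword8 *pu1_dst,
@                    word32 src_strd, word32 dst_strd,
@                    word8 *pi1_coeff, word32 ht, word32 wd)
@ {
@     (void)pi1_coeff; /* no filtering for the copy case */
@     for(word32 row = 0; row < ht; row++)
@         for(word32 col = 0; col < wd; col++)
@             pu1_dst[row * dst_strd + col] = pu1_src[row * src_strd + col];
@ }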
.text
.align 4
.globl ihevc_inter_pred_luma_copy_a9q
.type ihevc_inter_pred_luma_copy_a9q, %function
ihevc_inter_pred_luma_copy_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
ldr r12,[sp,#48] @loads wd
ldr r7,[sp,#44] @loads ht
cmp r7,#0 @checks ht == 0
ble end_loops
tst r12,#15 @checks whether wd is a multiple of 16
beq core_loop_wd_16
tst r12,#7 @checks whether wd is a multiple of 8
beq core_loop_wd_8
sub r11,r12,#4 @wd - 4 (row rewind offset)
outer_loop_wd_4:
subs r4,r12,#0 @checks wd == 0
ble end_inner_loop_wd_4
inner_loop_wd_4:
vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
add r5,r0,r2 @pu1_src_tmp += src_strd
add r6,r1,r3 @pu1_dst_tmp += dst_strd
vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
add r0,r0,#4 @pu1_src += 4
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
subs r4,r4,#4 @(wd -4)
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
add r1,r1,#4 @pu1_dst += 4
vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
bgt inner_loop_wd_4
end_inner_loop_wd_4:
subs r7,r7,#4 @ht - 4
sub r0,r5,r11 @pu1_src = pu1_src_tmp
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_4
end_loops:
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
core_loop_wd_8:
sub r11,r12,#8 @wd - 8 (row rewind offset)
outer_loop_wd_8:
subs r4,r12,#0 @checks wd
ble end_inner_loop_wd_8
inner_loop_wd_8:
add r5,r0,r2 @pu1_src_tmp += src_strd
vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)
add r6,r1,r3 @pu1_dst_tmp += dst_strd
vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
subs r4,r4,#8 @wd - 8(loop condition)
vld1.8 {d2},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {d2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {d3},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {d3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
bgt inner_loop_wd_8
end_inner_loop_wd_8:
subs r7,r7,#4 @ht -= 4
sub r0,r5,r11 @pu1_src = pu1_src_tmp
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_8
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
core_loop_wd_16:
sub r11,r12,#16 @wd - 16 (row rewind offset)
outer_loop_wd_16:
subs r4,r12,#0 @checks wd
ble end_inner_loop_wd_16
inner_loop_wd_16:
add r5,r0,r2 @pu1_src_tmp += src_strd
vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)
add r6,r1,r3 @pu1_dst_tmp += dst_strd
vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
subs r4,r4,#16 @wd - 16 (loop condition)
vld1.8 {q2},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {q2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
vld1.8 {q3},[r5],r2 @vld1_u8(pu1_src_tmp)
vst1.8 {q3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
bgt inner_loop_wd_16
end_inner_loop_wd_16:
subs r7,r7,#4 @ht -= 4
sub r0,r5,r11 @pu1_src = pu1_src_tmp
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_16
ldmfd sp!,{r4-r12,r15} @reload the registers from sp