Removed x86-specific parts from CMake and added ARM64 intrinsic functions. CMake should now work on all platforms.
This commit is contained in: parent 068ffc674c · commit 96bbc6c07e
10 changed files with 1527 additions and 43 deletions
@@ -4,8 +4,21 @@ project(liby262)
find_package(Threads)
find_program(YASM_EXE NAMES yasm)

if(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(ARCH "_x64")
message( "architecture: ${CMAKE_SYSTEM_PROCESSOR}")

set(Y262_TARGET_ARCH "unknown")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(Y262_TARGET_ARCH "intelx86")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|x86.*|i386.*")
set(Y262_TARGET_ARCH "intelx86")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|ARM64.*|AARCH64.*)")
set(Y262_TARGET_ARCH "arm64")
endif()

message( "target_arch: ${Y262_TARGET_ARCH}")

if(Y262_TARGET_ARCH MATCHES "intelx86")
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
if(WIN32)
set(YASM_ARGS -f win32 -m amd64 -DARCH_X86_64 -DPIC)
elseif(APPLE)

@@ -13,8 +26,7 @@ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
else()
set(YASM_ARGS -f elf64 -m amd64 -DARCH_X86_64 -DPIC)
endif()
else()
set(ARCH "_x86")
else()
if(WIN32)
set(YASM_ARGS -f win32 --prefix=_)
elseif(APPLE)

@@ -22,14 +34,15 @@ else()
else()
set(YASM_ARGS -f elf32)
endif()
endif()

add_custom_command(OUTPUT pixelop_x86.o COMMAND ${YASM_EXE}
ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_x86.asm)
add_custom_command(OUTPUT transform_x86.o COMMAND ${YASM_EXE}
ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/transform_x86.asm)
endif()

add_custom_command(OUTPUT pixelop_x86.o COMMAND ${YASM_EXE}
ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_x86.asm)
add_custom_command(OUTPUT transform_x86.o COMMAND ${YASM_EXE}
ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/transform_x86.asm)

add_library(liby262 STATIC
set(liby262_sources_basic
${CMAKE_CURRENT_SOURCE_DIR}/aboveslicelevel.h
${CMAKE_CURRENT_SOURCE_DIR}/bitstream.h
${CMAKE_CURRENT_SOURCE_DIR}/lookahead.h

@@ -56,17 +69,36 @@ add_library(liby262 STATIC
${CMAKE_CURRENT_SOURCE_DIR}/transform.c
${CMAKE_CURRENT_SOURCE_DIR}/y262.c
${CMAKE_CURRENT_SOURCE_DIR}/y262api.c
)

set(liby262_sources_assembly "")

if(Y262_TARGET_ARCH MATCHES "intelx86")
set(liby262_sources_assembly
${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o
${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o
)
elseif(Y262_TARGET_ARCH MATCHES "arm64")
set(liby262_sources_assembly
${CMAKE_CURRENT_SOURCE_DIR}/transform_arm64.c
${CMAKE_CURRENT_SOURCE_DIR}/transform_arm64.h
${CMAKE_CURRENT_SOURCE_DIR}/pixelop_arm64.c
${CMAKE_CURRENT_SOURCE_DIR}/pixelop_arm64.h
)
endif()

add_library(liby262 STATIC
${liby262_sources_basic}
${liby262_sources_assembly}
)

set_target_properties(liby262 PROPERTIES
OUTPUT_NAME "liby262$<$<CONFIG:Debug>:d>${ARCH}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
)
target_include_directories(liby262 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

if(Y262_TARGET_ARCH MATCHES "intelx86")
add_compile_definitions(ASSEMBLY_X86)
elseif(Y262_TARGET_ARCH MATCHES "arm64")
add_compile_definitions(ASSEMBLY_ARM64)
endif()

if(WIN32)
target_compile_definitions(liby262 PRIVATE WIN32)

@@ -77,8 +109,12 @@ else()
target_link_libraries(liby262 PUBLIC m)
endif()

set_target_properties(liby262 PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(liby262 PROPERTIES
OUTPUT_NAME "liby262$<$<CONFIG:Debug>:d>${MY_ARCH}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
POSITION_INDEPENDENT_CODE ON
)

target_include_directories(liby262 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

target_link_libraries(liby262 PUBLIC Threads::Threads)
127 src/y262/mc.c
@@ -30,6 +30,8 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "y262.h"
|
||||
|
||||
#ifdef ASSEMBLY_X86
|
||||
|
||||
void y262_motcomp_16x16_00_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_01_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_10_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
@@ -80,6 +82,65 @@ void y262_motcomp_8x4_01_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, ui
|
|||
void y262_motcomp_8x4_10_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_11_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef ASSEMBLY_ARM64
|
||||
|
||||
void y262_motcomp_16x16_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_16x8_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x16_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x8_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x4_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_16x16_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_16x8_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x16_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x8_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x4_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
#endif
@@ -322,7 +383,7 @@ void y262_init_motion_compensation( y262_t *ps_y262 )
|
|||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_avg;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg;
|
||||
|
||||
#if 1
|
||||
#ifdef ASSEMBLY_X86
|
||||
|
||||
if( 1 )
|
||||
{
|
||||
|
@@ -382,6 +443,70 @@ void y262_init_motion_compensation( y262_t *ps_y262 )
|
|||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg_mmxext;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ASSEMBLY_ARM64
|
||||
|
||||
if( 1 )
|
||||
{
|
||||
/* copy */
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_00 ] = y262_motcomp_16x16_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ] = y262_motcomp_16x16_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_10 ] = y262_motcomp_16x16_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_11 ] = y262_motcomp_16x16_11_put_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_00 ] = y262_motcomp_16x8_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_01 ] = y262_motcomp_16x8_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_10 ] = y262_motcomp_16x8_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_11 ] = y262_motcomp_16x8_11_put_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_00 ] = y262_motcomp_8x16_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_01 ] = y262_motcomp_8x16_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_10 ] = y262_motcomp_8x16_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_11 ] = y262_motcomp_8x16_11_put_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_00 ] = y262_motcomp_8x8_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_01 ] = y262_motcomp_8x8_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_10 ] = y262_motcomp_8x8_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_11 ] = y262_motcomp_8x8_11_put_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_00 ] = y262_motcomp_8x4_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_01 ] = y262_motcomp_8x4_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_put_neon;
|
||||
|
||||
|
||||
|
||||
/* avg */
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_00 ] = y262_motcomp_16x16_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ] = y262_motcomp_16x16_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_10 ] = y262_motcomp_16x16_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_11 ] = y262_motcomp_16x16_11_avg_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_00 ] = y262_motcomp_16x8_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_01 ] = y262_motcomp_16x8_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_10 ] = y262_motcomp_16x8_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_11 ] = y262_motcomp_16x8_11_avg_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_00 ] = y262_motcomp_8x16_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_01 ] = y262_motcomp_8x16_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_10 ] = y262_motcomp_8x16_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_11 ] = y262_motcomp_8x16_11_avg_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_00 ] = y262_motcomp_8x8_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_01 ] = y262_motcomp_8x8_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_10 ] = y262_motcomp_8x8_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_11 ] = y262_motcomp_8x8_11_avg_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_00 ] = y262_motcomp_8x4_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_01 ] = y262_motcomp_8x4_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg_neon;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
}
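For orientation, here is a minimal sketch (not part of this commit) of how the function-pointer table initialized above is typically consumed; the block-size and half-pel index names are taken from the assignments, and the choice of MC_BLOCK_01 as "horizontal half-pel" is illustrative:

/* sketch, not part of the commit: dispatch a 16x16 copy with horizontal
   half-pel interpolation through the table filled in above */
static void example_dispatch_copy( y262_t *ps_y262, uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride )
{
    ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ]( pui8_src, i_src_stride, pui8_dst, i_dst_stride );
}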
@@ -226,6 +226,8 @@ int32_t y262_satd_16x8( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk
|
|||
return i_satd;
|
||||
}
|
||||
|
||||
#ifdef ASSEMBLY_X86
|
||||
|
||||
int32_t y262_satd_16x16_sse2( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_satd;
|
||||
|
@@ -249,6 +251,7 @@ int32_t y262_satd_16x8_sse2( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui
|
|||
return i_satd;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
int32_t y262_ssd_16x16( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride )
|
||||
{
|
||||
|
|
773 src/y262/pixelop_arm64.c (new file)
@@ -0,0 +1,773 @@
|
|||
/*
|
||||
Copyright (c) 2013, Ralf Willenbacher
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "y262.h"
|
||||
|
||||
int32_t y262_sad_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_sad, i_y;
|
||||
int64_t i64_sad;
|
||||
uint8x16_t v16_blk1, v16_blk2;
|
||||
uint8x8_t v8_a, v8_b;
|
||||
uint16x8_t v16_res0, v16_res1;
|
||||
uint32x4_t v16_hadd0;
|
||||
uint32x2_t v8_hadd1;
|
||||
uint64x1_t v8_hadd2;
|
||||
|
||||
v16_blk1 = vld1q_u8 ( pui8_blk1 );
|
||||
pui8_blk1 += i_stride1;
|
||||
v16_blk2 = vld1q_u8 ( pui8_blk2 );
|
||||
pui8_blk2 += i_stride2;
|
||||
v8_a = vget_low_u8( v16_blk1 );
|
||||
v8_b = vget_low_u8( v16_blk2 );
|
||||
v16_res0 = vabdl_u8 ( v8_a, v8_b );
|
||||
v8_a = vget_high_u8( v16_blk1 );
|
||||
v8_b = vget_high_u8( v16_blk2 );
|
||||
v16_res1 = vabdl_u8 ( v8_a, v8_b );
|
||||
|
||||
for( i_y = 1; i_y < 8; i_y++ )
|
||||
{
|
||||
v16_blk1 = vld1q_u8 ( pui8_blk1 );
|
||||
pui8_blk1 += i_stride1;
|
||||
v16_blk2 = vld1q_u8 ( pui8_blk2 );
|
||||
pui8_blk2 += i_stride2;
|
||||
v8_a = vget_low_u8( v16_blk1 );
|
||||
v8_b = vget_low_u8( v16_blk2 );
|
||||
v16_res0 = vabal_u8 ( v16_res0, v8_a, v8_b );
|
||||
v8_a = vget_high_u8( v16_blk1 );
|
||||
v8_b = vget_high_u8( v16_blk2 );
|
||||
v16_res1 = vabal_u8 ( v16_res1, v8_a, v8_b );
|
||||
}
|
||||
|
||||
v16_res0 = vaddq_u16( v16_res0, v16_res1 );
|
||||
v16_hadd0 = vpaddlq_u16( v16_res0 );
|
||||
v8_hadd1 = vadd_u32( vget_low_u32( v16_hadd0 ), vget_high_u32( v16_hadd0 ) );
|
||||
v8_hadd2 = vpaddl_u32( v8_hadd1 );
|
||||
|
||||
i64_sad = vget_lane_u64( v8_hadd2, 0 );
|
||||
i_sad = ( int32_t )i64_sad;
|
||||
|
||||
return i_sad;
|
||||
}
|
||||
|
||||
|
||||
int32_t y262_sad_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_sad, i_y;
|
||||
int64_t i64_sad;
|
||||
uint8x16_t v16_blk1, v16_blk2;
|
||||
uint8x8_t v8_a, v8_b;
|
||||
uint16x8_t v16_res0, v16_res1;
|
||||
uint32x4_t v16_hadd0;
|
||||
uint32x2_t v8_hadd1;
|
||||
uint64x1_t v8_hadd2;
|
||||
|
||||
v16_blk1 = vld1q_u8 ( pui8_blk1 );
|
||||
pui8_blk1 += i_stride1;
|
||||
v16_blk2 = vld1q_u8 ( pui8_blk2 );
|
||||
pui8_blk2 += i_stride2;
|
||||
v8_a = vget_low_u8( v16_blk1 );
|
||||
v8_b = vget_low_u8( v16_blk2 );
|
||||
v16_res0 = vabdl_u8 ( v8_a, v8_b );
|
||||
v8_a = vget_high_u8( v16_blk1 );
|
||||
v8_b = vget_high_u8( v16_blk2 );
|
||||
v16_res1 = vabdl_u8 ( v8_a, v8_b );
|
||||
|
||||
for( i_y = 1; i_y < 16; i_y++ )
|
||||
{
|
||||
v16_blk1 = vld1q_u8 ( pui8_blk1 );
|
||||
pui8_blk1 += i_stride1;
|
||||
v16_blk2 = vld1q_u8 ( pui8_blk2 );
|
||||
pui8_blk2 += i_stride2;
|
||||
v8_a = vget_low_u8( v16_blk1 );
|
||||
v8_b = vget_low_u8( v16_blk2 );
|
||||
v16_res0 = vabal_u8 ( v16_res0, v8_a, v8_b );
|
||||
v8_a = vget_high_u8( v16_blk1 );
|
||||
v8_b = vget_high_u8( v16_blk2 );
|
||||
v16_res1 = vabal_u8 ( v16_res1, v8_a, v8_b );
|
||||
}
|
||||
|
||||
v16_res0 = vaddq_u16( v16_res0, v16_res1 );
|
||||
v16_hadd0 = vpaddlq_u16( v16_res0 );
|
||||
v8_hadd1 = vadd_u32( vget_low_u32( v16_hadd0 ), vget_high_u32( v16_hadd0 ) );
|
||||
v8_hadd2 = vpaddl_u32( v8_hadd1 );
|
||||
|
||||
i64_sad = vget_lane_u64( v8_hadd2, 0 );
|
||||
i_sad = ( int32_t )i64_sad;
|
||||
|
||||
return i_sad;
|
||||
}
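Both SAD routines above can be cross-checked against a plain scalar reference; the following is a minimal sketch under the same block and stride conventions and is not part of the commit:

/* scalar reference SAD for validating y262_sad_16x8_neon / y262_sad_16x16_neon (sketch) */
static int32_t example_sad_scalar( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2, int32_t i_width, int32_t i_height )
{
    int32_t i_x, i_y, i_d, i_sad = 0;
    for( i_y = 0; i_y < i_height; i_y++ )
    {
        for( i_x = 0; i_x < i_width; i_x++ )
        {
            i_d = ( int32_t )pui8_blk1[ i_x ] - ( int32_t )pui8_blk2[ i_x ];
            i_sad += i_d < 0 ? -i_d : i_d;
        }
        pui8_blk1 += i_stride1;
        pui8_blk2 += i_stride2;
    }
    return i_sad;
}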
|
||||
|
||||
|
||||
#define HADAMARD_NEON_4x2( d0, d1, d2, d3 ) \
|
||||
{ \
|
||||
int32x4x2_t v32_a0, v32_b0; \
|
||||
int16x8x2_t v32_a1, v32_b1; \
|
||||
\
|
||||
d0 = vaddq_s16( d0, d1 ); \
|
||||
d2 = vaddq_s16( d2, d3 ); \
|
||||
\
|
||||
d1 = vaddq_s16( d1, d1 ); \
|
||||
d3 = vaddq_s16( d3, d3 ); \
|
||||
d1 = vsubq_s16( d1, d0 ); \
|
||||
d3 = vsubq_s16( d3, d2 ); \
|
||||
\
|
||||
d0 = vaddq_s16( d0, d2 ); \
|
||||
d1 = vaddq_s16( d1, d3 ); \
|
||||
\
|
||||
d2 = vaddq_s16( d2, d2 ); \
|
||||
d3 = vaddq_s16( d3, d3 ); \
|
||||
d2 = vsubq_s16( d2, d0 ); \
|
||||
d3 = vsubq_s16( d3, d1 ); \
|
||||
\
|
||||
v32_a0 = vtrnq_s32( vreinterpretq_s32_s16( d0 ), vreinterpretq_s32_s16( d2 ) ); \
|
||||
v32_b0 = vtrnq_s32( vreinterpretq_s32_s16( d1 ), vreinterpretq_s32_s16( d3 ) ); \
|
||||
v32_a1 = vtrnq_s16( vreinterpretq_s16_s32( v32_a0.val[ 0 ] ), vreinterpretq_s16_s32( v32_b0.val[ 0 ] ) ); \
|
||||
v32_b1 = vtrnq_s16( vreinterpretq_s16_s32( v32_a0.val[ 1 ] ), vreinterpretq_s16_s32( v32_b0.val[ 1 ] ) ); \
|
||||
d0 = vcombine_s16( vget_low_s16( v32_a1.val[ 0 ] ), vget_high_s16( v32_a1.val[ 0 ] ) ); \
|
||||
d1 = vcombine_s16( vget_low_s16( v32_a1.val[ 1 ] ), vget_high_s16( v32_a1.val[ 1 ] ) ); \
|
||||
d2 = vcombine_s16( vget_low_s16( v32_b1.val[ 1 ] ), vget_high_s16( v32_b1.val[ 1 ] ) ); \
|
||||
d3 = vcombine_s16( vget_low_s16( v32_b1.val[ 0 ] ), vget_high_s16( v32_b1.val[ 0 ] ) ); \
|
||||
\
|
||||
d0 = vaddq_s16( d0, d1 ); \
|
||||
d2 = vaddq_s16( d2, d3 ); \
|
||||
\
|
||||
d1 = vaddq_s16( d1, d1 ); \
|
||||
d3 = vaddq_s16( d3, d3 ); \
|
||||
d1 = vsubq_s16( d1, d0 ); \
|
||||
d3 = vsubq_s16( d3, d2 ); \
|
||||
\
|
||||
d0 = vaddq_s16( d0, d2 ); \
|
||||
d1 = vaddq_s16( d1, d3 ); \
|
||||
\
|
||||
d2 = vaddq_s16( d2, d2 ); \
|
||||
d3 = vaddq_s16( d3, d3 ); \
|
||||
d2 = vsubq_s16( d2, d0 ); \
|
||||
d3 = vsubq_s16( d3, d1 ); \
|
||||
}
|
||||
|
||||
#define ADDSUB( d0, d1 ) \
|
||||
{ \
|
||||
int16x8_t v16_a, v16_s; \
|
||||
v16_a = vaddq_s16( d0, d1 ); \
|
||||
v16_s = vsubq_s16( d1, d0 ); \
|
||||
d0 = v16_a; \
|
||||
d1 = v16_s; \
|
||||
}
|
||||
|
||||
|
||||
#define ABSADDL( sum, vector0, vector1 ) \
|
||||
{ \
|
||||
vector0 = vabsq_s16( vector0 ); \
|
||||
vector1 = vabsq_s16( vector1 ); \
|
||||
sum = vaddq_s32( sum, vaddl_s16( vget_low_s16( vector0 ), vget_high_s16( vector0 ) ) ); \
|
||||
sum = vaddq_s32( sum, vaddl_s16( vget_low_s16( vector1 ), vget_high_s16( vector1 ) ) ); \
|
||||
}
|
||||
|
||||
int32_t y262_satd_8x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_satd;
|
||||
int64_t i64_satd;
|
||||
int16x8_t v16_d0, v16_d1, v16_d2, v16_d3, v16_d4, v16_d5, v16_d6, v16_d7;
|
||||
int16x8_t v16_a0, v16_a1, v16_a2, v16_a3;
|
||||
int32x4_t v16_res;
|
||||
int32x2_t v8_hadd0;
|
||||
int64x1_t v8_hadd1;
|
||||
|
||||
v16_res = vmovq_n_s32( 0 );
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 0 ), vld1_u8( pui8_blk2 + i_stride2 * 0 ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 1 ), vld1_u8( pui8_blk2 + i_stride2 * 1 ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 2 ), vld1_u8( pui8_blk2 + i_stride2 * 2 ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 3 ), vld1_u8( pui8_blk2 + i_stride2 * 3 ) ) );
|
||||
v16_d4 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 4 ), vld1_u8( pui8_blk2 + i_stride2 * 4 ) ) );
|
||||
v16_d5 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 5 ), vld1_u8( pui8_blk2 + i_stride2 * 5 ) ) );
|
||||
v16_d6 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 6 ), vld1_u8( pui8_blk2 + i_stride2 * 6 ) ) );
|
||||
v16_d7 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 7 ), vld1_u8( pui8_blk2 + i_stride2 * 7 ) ) );
|
||||
|
||||
HADAMARD_NEON_4x2( v16_d0, v16_d1, v16_d2, v16_d3 );
|
||||
HADAMARD_NEON_4x2( v16_d4, v16_d5, v16_d6, v16_d7 );
|
||||
|
||||
ADDSUB( v16_d0, v16_d4 );
|
||||
ADDSUB( v16_d1, v16_d5 );
|
||||
ADDSUB( v16_d2, v16_d6 );
|
||||
ADDSUB( v16_d3, v16_d7 );
|
||||
|
||||
v16_a0 = vcombine_s16( vget_low_s16( v16_d0 ), vget_low_s16( v16_d4 ) );
|
||||
v16_a1 = vcombine_s16( vget_high_s16( v16_d0 ), vget_high_s16( v16_d4 ) );
|
||||
ADDSUB( v16_a0, v16_a1 );
|
||||
ABSADDL( v16_res, v16_a0, v16_a1 );
|
||||
|
||||
v16_a0 = vcombine_s16( vget_low_s16( v16_d1 ), vget_low_s16( v16_d5 ) );
|
||||
v16_a1 = vcombine_s16( vget_high_s16( v16_d1 ), vget_high_s16( v16_d5 ) );
|
||||
ADDSUB( v16_a0, v16_a1 );
|
||||
ABSADDL( v16_res, v16_a0, v16_a1 );
|
||||
|
||||
v16_a0 = vcombine_s16( vget_low_s16( v16_d2 ), vget_low_s16( v16_d6 ) );
|
||||
v16_a1 = vcombine_s16( vget_high_s16( v16_d2 ), vget_high_s16( v16_d6 ) );
|
||||
ADDSUB( v16_a0, v16_a1 );
|
||||
ABSADDL( v16_res, v16_a0, v16_a1 );
|
||||
|
||||
v16_a0 = vcombine_s16( vget_low_s16( v16_d3 ), vget_low_s16( v16_d7 ) );
|
||||
v16_a1 = vcombine_s16( vget_high_s16( v16_d3 ), vget_high_s16( v16_d7 ) );
|
||||
ADDSUB( v16_a0, v16_a1 );
|
||||
ABSADDL( v16_res, v16_a0, v16_a1 );
|
||||
|
||||
v8_hadd0 = vadd_s32( vget_low_s32( v16_res ), vget_high_s32( v16_res ) );
|
||||
v8_hadd1 = vpaddl_s32( v8_hadd0 );
|
||||
|
||||
i64_satd = vget_lane_s64( v8_hadd1, 0 );
|
||||
i_satd = ( int32_t )i64_satd;
|
||||
|
||||
i_satd = ( i_satd + 2 ) >> 2;
|
||||
|
||||
return i_satd;
|
||||
}
|
||||
|
||||
|
||||
int32_t y262_satd_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_satd;
|
||||
|
||||
i_satd = y262_satd_8x8_neon( pui8_blk1, i_stride1, pui8_blk2, i_stride2 );
|
||||
i_satd += y262_satd_8x8_neon( pui8_blk1 + 8, i_stride1, pui8_blk2 + 8, i_stride2 );
|
||||
i_satd += y262_satd_8x8_neon( pui8_blk1 + ( 8 * i_stride1 ), i_stride1, pui8_blk2 + ( 8 * i_stride2 ), i_stride2 );
|
||||
i_satd += y262_satd_8x8_neon( pui8_blk1 + 8 + ( 8 * i_stride1 ), i_stride1, pui8_blk2 + 8 + ( 8 * i_stride2 ), i_stride2 );
|
||||
|
||||
return i_satd;
|
||||
}
|
||||
|
||||
|
||||
int32_t y262_satd_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_satd;
|
||||
|
||||
i_satd = y262_satd_8x8_neon( pui8_blk1, i_stride1, pui8_blk2, i_stride2 );
|
||||
i_satd += y262_satd_8x8_neon( pui8_blk1 + 8, i_stride1, pui8_blk2 + 8, i_stride2 );
|
||||
|
||||
return i_satd;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int32_t y262_ssd_8x8_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride )
|
||||
{
|
||||
int32_t i_ssd;
|
||||
int16x8_t v16_d0, v16_d1, v16_d2, v16_d3;
|
||||
int32x4_t v16_ssd0, v16_ssd1;
|
||||
int32x2_t v8_hadd0;
|
||||
int64x1_t v8_hadd1;
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 0 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 0 * i_blk2_stride ) ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 1 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 1 * i_blk2_stride ) ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 2 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 2 * i_blk2_stride ) ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 3 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 3 * i_blk2_stride ) ) ) );
|
||||
|
||||
v16_ssd0 = vmull_s16( vget_low_s16( v16_d0 ), vget_low_s16( v16_d0 ) );
|
||||
v16_ssd1 = vmull_s16( vget_high_s16( v16_d0 ), vget_high_s16( v16_d0 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d1 ), vget_low_s16( v16_d1 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d1 ), vget_high_s16( v16_d1 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d2 ), vget_low_s16( v16_d2 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d2 ), vget_high_s16( v16_d2 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d3 ), vget_low_s16( v16_d3 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d3 ), vget_high_s16( v16_d3 ) );
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 4 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 4 * i_blk2_stride ) ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 5 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 5 * i_blk2_stride ) ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 6 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 6 * i_blk2_stride ) ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 7 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 7 * i_blk2_stride ) ) ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d0 ), vget_low_s16( v16_d0 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d0 ), vget_high_s16( v16_d0 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d1 ), vget_low_s16( v16_d1 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d1 ), vget_high_s16( v16_d1 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d2 ), vget_low_s16( v16_d2 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d2 ), vget_high_s16( v16_d2 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d3 ), vget_low_s16( v16_d3 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d3 ), vget_high_s16( v16_d3 ) );
|
||||
|
||||
v16_ssd0 = vaddq_s32( v16_ssd0, v16_ssd1 );
|
||||
|
||||
v8_hadd0 = vadd_s32( vget_low_s32( v16_ssd0 ), vget_high_s32( v16_ssd0 ) );
|
||||
v8_hadd0 = vpadd_s32( v8_hadd0, v8_hadd0 );
|
||||
|
||||
i_ssd = vget_lane_s32( v8_hadd0, 0 );
|
||||
|
||||
return i_ssd;
|
||||
}
|
||||
|
||||
int32_t y262_ssd_16x16_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride )
|
||||
{
|
||||
int32_t i_ssd;
|
||||
|
||||
i_ssd = y262_ssd_8x8_neon( pui8_blk1, i_blk1_stride, pui8_blk2, i_blk2_stride );
|
||||
i_ssd += y262_ssd_8x8_neon( pui8_blk1 + 8, i_blk1_stride, pui8_blk2 + 8, i_blk2_stride );
|
||||
i_ssd += y262_ssd_8x8_neon( pui8_blk1 + ( 8 * i_blk1_stride ), i_blk1_stride, pui8_blk2 + ( 8 * i_blk2_stride ), i_blk2_stride );
|
||||
i_ssd += y262_ssd_8x8_neon( pui8_blk1 + 8 + ( 8 * i_blk1_stride ), i_blk1_stride, pui8_blk2 + 8 + ( 8 * i_blk2_stride), i_blk2_stride );
|
||||
return i_ssd;
|
||||
}
|
||||
|
||||
|
||||
void y262_sub_8x8_neon( int16_t *pi16_diff, uint8_t *pui8_src1, int32_t i_stride_src1, uint8_t *pui8_src2, int32_t i_stride_src2 )
|
||||
{
|
||||
int16x8_t v16_d0, v16_d1, v16_d2, v16_d3;
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 0 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 0 * i_stride_src2 ) ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 1 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 1 * i_stride_src2 ) ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 2 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 2 * i_stride_src2 ) ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 3 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 3 * i_stride_src2 ) ) ) );
|
||||
|
||||
vst1q_s16( pi16_diff + 0, v16_d0 );
|
||||
vst1q_s16( pi16_diff + 8, v16_d1 );
|
||||
vst1q_s16( pi16_diff + 16, v16_d2 );
|
||||
vst1q_s16( pi16_diff + 24, v16_d3 );
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 4 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 4 * i_stride_src2 ) ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 5 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 5 * i_stride_src2 ) ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 6 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 6 * i_stride_src2 ) ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 7 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 7 * i_stride_src2 ) ) ) );
|
||||
|
||||
vst1q_s16( pi16_diff + 32, v16_d0 );
|
||||
vst1q_s16( pi16_diff + 40, v16_d1 );
|
||||
vst1q_s16( pi16_diff + 48, v16_d2 );
|
||||
vst1q_s16( pi16_diff + 56, v16_d3 );
|
||||
}
|
||||
|
||||
void y262_add_8x8_neon( uint8_t *pui8_destination, int32_t i_destination_stride, uint8_t *pui8_base, int32_t i_base_stride, int16_t *pi16_difference )
|
||||
{
|
||||
int32_t i_y;
|
||||
|
||||
int16x8_t v16_zero = vmovq_n_s16( 0 );
|
||||
|
||||
for( i_y = 0; i_y < 8; i_y += 2 )
|
||||
{
|
||||
int16x8_t v16_d0, v16_b0, v16_d1, v16_b1;
|
||||
|
||||
v16_d0 = vld1q_s16( pi16_difference + ( i_y * 8 ) );
|
||||
v16_d1 = vld1q_s16( pi16_difference + ( ( i_y + 1 ) * 8 ) );
|
||||
v16_b0 = vreinterpretq_s16_u16( vshll_n_u8( vld1_u8( pui8_base + ( i_y * i_base_stride ) ), 0 ) );
|
||||
v16_b1 = vreinterpretq_s16_u16( vshll_n_u8( vld1_u8( pui8_base + ( ( i_y + 1 ) * i_base_stride ) ), 0 ) );
|
||||
|
||||
v16_d0 = vaddq_s16( v16_d0, v16_b0 );
|
||||
v16_d1 = vaddq_s16( v16_d1, v16_b1 );
|
||||
|
||||
v16_d0 = vmaxq_s16( v16_zero, v16_d0 );
|
||||
v16_d1 = vmaxq_s16( v16_zero, v16_d1 );
|
||||
|
||||
vst1_u8( pui8_destination + ( i_y * i_destination_stride ), vqmovn_u16( vreinterpretq_u16_s16( v16_d0 ) ) );
|
||||
vst1_u8( pui8_destination + ( ( i_y + 1 ) * i_destination_stride ), vqmovn_u16( vreinterpretq_u16_s16( v16_d1 ) ) );
|
||||
}
|
||||
}
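The sub/add pair above implements the 8x8 residual round trip; a short usage sketch follows (pointer and stride names are illustrative, not from this commit):

/* sketch: compute an 8x8 residual against the prediction, then reconstruct on top of it */
static void example_residual_roundtrip( uint8_t *pui8_cur, int32_t i_cur_stride, uint8_t *pui8_pred, int32_t i_pred_stride, uint8_t *pui8_recon, int32_t i_recon_stride )
{
    int16_t rgi16_diff[ 64 ];
    y262_sub_8x8_neon( rgi16_diff, pui8_cur, i_cur_stride, pui8_pred, i_pred_stride );
    /* forward transform, quantization and inverse transform would run on rgi16_diff here */
    y262_add_8x8_neon( pui8_recon, i_recon_stride, pui8_pred, i_pred_stride, rgi16_diff );
}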
|
||||
|
||||
|
||||
/* MC */
|
||||
|
||||
#define MC_FUNC_NEON( name, i_width, i_height, hpelidx ) \
|
||||
void y262_motcomp_##name##_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ) \
|
||||
{ \
|
||||
int32_t i_x, i_y; \
|
||||
\
|
||||
if( hpelidx == 0 ) \
|
||||
{ \
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a; \
|
||||
v8_a = vld1_u8( pui8_src ); \
|
||||
vst1_u8( pui8_dst, v8_a ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a; \
|
||||
v16_a = vld1q_u8( pui8_src ); \
|
||||
vst1q_u8( pui8_dst, v16_a ); \
|
||||
} \
|
||||
pui8_src += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else if( hpelidx == 1 ) \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2; \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + 1; \
|
||||
\
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_b; \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_b ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_b; \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_b ) ); \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else if( hpelidx == 2 ) \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2; \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + i_src_stride; \
|
||||
\
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_b; \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_b ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_b; \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_b ) ); \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2, *pui8_src3, *pui8_src4; \
|
||||
uint8x8_t v8_a, v8_b, v8_c, v8_d; \
|
||||
uint8x16_t v16_a, v16_b, v16_c, v16_d; \
|
||||
uint8x8_t v8_one = vmov_n_u8( 1 ); \
|
||||
uint8x16_t v16_one = vmovq_n_u8( 1 ); \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + 1; \
|
||||
pui8_src3 = pui8_src + i_src_stride; \
|
||||
pui8_src4 = pui8_src + i_src_stride + 1; \
|
||||
\
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
} \
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_carry0, v8_carry1; \
|
||||
v8_c = vld1_u8( pui8_src3 ); \
|
||||
v8_d = vld1_u8( pui8_src4 ); \
|
||||
\
|
||||
v8_carry0 = veor_u8( v8_a, v8_c); \
|
||||
v8_carry1 = veor_u8( v8_b, v8_d); \
|
||||
\
|
||||
v8_a = vrhadd_u8( v8_a, v8_c ); \
|
||||
v8_b = vrhadd_u8( v8_b, v8_d ); \
|
||||
v8_carry0 = vorr_u8( v8_carry0, v8_carry1 ); \
|
||||
\
|
||||
v8_carry1 = veor_u8( v8_a, v8_b); \
|
||||
v8_carry0 = vand_u8( v8_carry0, v8_carry1 ); \
|
||||
v8_carry0 = vand_u8( v8_carry0, v8_one ); \
|
||||
\
|
||||
v8_a = vrhadd_u8( v8_a, v8_b ); \
|
||||
v8_a = vsub_u8( v8_a, v8_carry0 ); \
|
||||
\
|
||||
vst1_u8( pui8_dst, v8_a ); \
|
||||
\
|
||||
v8_a = v8_c; \
|
||||
v8_b = v8_d; \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_carry0, v16_carry1; \
|
||||
v16_c = vld1q_u8( pui8_src3 ); \
|
||||
v16_d = vld1q_u8( pui8_src4 ); \
|
||||
\
|
||||
v16_carry0 = veorq_u8( v16_a, v16_c); \
|
||||
v16_carry1 = veorq_u8( v16_b, v16_d); \
|
||||
\
|
||||
v16_a = vrhaddq_u8( v16_a, v16_c ); \
|
||||
v16_b = vrhaddq_u8( v16_b, v16_d ); \
|
||||
v16_carry0 = vorrq_u8( v16_carry0, v16_carry1 ); \
|
||||
\
|
||||
v16_carry1 = veorq_u8( v16_a, v16_b); \
|
||||
v16_carry0 = vandq_u8( v16_carry0, v16_carry1 ); \
|
||||
v16_carry0 = vandq_u8( v16_carry0, v16_one ); \
|
||||
\
|
||||
v16_a = vrhaddq_u8( v16_a, v16_b ); \
|
||||
v16_a = vsubq_u8( v16_a, v16_carry0 ); \
|
||||
\
|
||||
vst1q_u8( pui8_dst, v16_a ); \
|
||||
\
|
||||
v16_a = v16_c; \
|
||||
v16_b = v16_d; \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_src3 += i_src_stride; \
|
||||
pui8_src4 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
void y262_motcomp_##name##_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ) \
|
||||
{ \
|
||||
int32_t i_x, i_y; \
|
||||
\
|
||||
if( hpelidx == 0 ) \
|
||||
{ \
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_z; \
|
||||
v8_a = vld1_u8( pui8_src ); \
|
||||
v8_z = vld1_u8( pui8_dst ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_z; \
|
||||
v16_a = vld1q_u8( pui8_src ); \
|
||||
v16_z = vld1q_u8( pui8_dst ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
|
||||
} \
|
||||
pui8_src += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else if( hpelidx == 1 ) \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2; \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + 1; \
|
||||
\
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_b, v8_z; \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
v8_z = vld1_u8( pui8_dst ); \
|
||||
v8_a = vrhadd_u8( v8_a, v8_b ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_b, v16_z; \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
v16_z = vld1q_u8( pui8_dst ); \
|
||||
v16_a = vrhaddq_u8( v16_a, v16_b ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else if( hpelidx == 2 ) \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2; \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + i_src_stride; \
|
||||
\
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_b, v8_z; \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
v8_z = vld1_u8( pui8_dst ); \
|
||||
v8_a = vrhadd_u8( v8_a, v8_b ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_b, v16_z; \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
v16_z = vld1q_u8( pui8_dst ); \
|
||||
v16_a = vrhaddq_u8( v16_a, v16_b ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2, *pui8_src3, *pui8_src4; \
|
||||
uint8x8_t v8_a, v8_b, v8_c, v8_d, v8_z; \
|
||||
uint8x16_t v16_a, v16_b, v16_c, v16_d, v16_z; \
|
||||
uint8x8_t v8_one = vmov_n_u8( 1 ); \
|
||||
uint8x16_t v16_one = vmovq_n_u8( 1 ); \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + 1; \
|
||||
pui8_src3 = pui8_src + i_src_stride; \
|
||||
pui8_src4 = pui8_src + i_src_stride + 1; \
|
||||
\
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
} \
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_carry0, v8_carry1; \
|
||||
v8_c = vld1_u8( pui8_src3 ); \
|
||||
v8_d = vld1_u8( pui8_src4 ); \
|
||||
v8_z = vld1_u8( pui8_dst ); \
|
||||
\
|
||||
v8_carry0 = veor_u8( v8_a, v8_c); \
|
||||
v8_carry1 = veor_u8( v8_b, v8_d); \
|
||||
\
|
||||
v8_a = vrhadd_u8( v8_a, v8_c ); \
|
||||
v8_b = vrhadd_u8( v8_b, v8_d ); \
|
||||
v8_carry0 = vorr_u8( v8_carry0, v8_carry1 ); \
|
||||
\
|
||||
v8_carry1 = veor_u8( v8_a, v8_b); \
|
||||
v8_carry0 = vand_u8( v8_carry0, v8_carry1 ); \
|
||||
v8_carry0 = vand_u8( v8_carry0, v8_one ); \
|
||||
\
|
||||
v8_a = vrhadd_u8( v8_a, v8_b ); \
|
||||
v8_a = vsub_u8( v8_a, v8_carry0 ); \
|
||||
\
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
|
||||
\
|
||||
v8_a = v8_c; \
|
||||
v8_b = v8_d; \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_carry0, v16_carry1; \
|
||||
v16_c = vld1q_u8( pui8_src3 ); \
|
||||
v16_d = vld1q_u8( pui8_src4 ); \
|
||||
v16_z = vld1q_u8( pui8_dst ); \
|
||||
\
|
||||
v16_carry0 = veorq_u8( v16_a, v16_c); \
|
||||
v16_carry1 = veorq_u8( v16_b, v16_d); \
|
||||
\
|
||||
v16_a = vrhaddq_u8( v16_a, v16_c ); \
|
||||
v16_b = vrhaddq_u8( v16_b, v16_d ); \
|
||||
v16_carry0 = vorrq_u8( v16_carry0, v16_carry1 ); \
|
||||
\
|
||||
v16_carry1 = veorq_u8( v16_a, v16_b); \
|
||||
v16_carry0 = vandq_u8( v16_carry0, v16_carry1 ); \
|
||||
v16_carry0 = vandq_u8( v16_carry0, v16_one ); \
|
||||
\
|
||||
v16_a = vrhaddq_u8( v16_a, v16_b ); \
|
||||
v16_a = vsubq_u8( v16_a, v16_carry0 ); \
|
||||
\
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
|
||||
\
|
||||
v16_a = v16_c; \
|
||||
v16_b = v16_d; \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_src3 += i_src_stride; \
|
||||
pui8_src4 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
|
||||
|
||||
MC_FUNC_NEON( 16x16_00, 16, 16, 0 );
|
||||
MC_FUNC_NEON( 16x16_01, 16, 16, 1 );
|
||||
MC_FUNC_NEON( 16x16_10, 16, 16, 2 );
|
||||
MC_FUNC_NEON( 16x16_11, 16, 16, 3 );
|
||||
|
||||
MC_FUNC_NEON( 16x8_00, 16, 8, 0 );
|
||||
MC_FUNC_NEON( 16x8_01, 16, 8, 1 );
|
||||
MC_FUNC_NEON( 16x8_10, 16, 8, 2 );
|
||||
MC_FUNC_NEON( 16x8_11, 16, 8, 3 );
|
||||
|
||||
MC_FUNC_NEON( 8x16_00, 8, 16, 0 );
|
||||
MC_FUNC_NEON( 8x16_01, 8, 16, 1 );
|
||||
MC_FUNC_NEON( 8x16_10, 8, 16, 2 );
|
||||
MC_FUNC_NEON( 8x16_11, 8, 16, 3 );
|
||||
|
||||
MC_FUNC_NEON( 8x8_00, 8, 8, 0 );
|
||||
MC_FUNC_NEON( 8x8_01, 8, 8, 1 );
|
||||
MC_FUNC_NEON( 8x8_10, 8, 8, 2 );
|
||||
MC_FUNC_NEON( 8x8_11, 8, 8, 3 );
|
||||
|
||||
MC_FUNC_NEON( 8x4_00, 8, 4, 0 );
|
||||
MC_FUNC_NEON( 8x4_01, 8, 4, 1 );
|
||||
MC_FUNC_NEON( 8x4_10, 8, 4, 2 );
|
||||
MC_FUNC_NEON( 8x4_11, 8, 4, 3 );
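Each MC_FUNC_NEON instantiation above expands to a matching _put_neon and _avg_neon function. As a usage sketch (argument names are illustrative), the _avg_ variant averages the interpolated reference with whatever is already in the destination block:

/* sketch: 8x8 block, half-pel in both directions, averaged into an existing prediction;
   pui8_ref must have one extra row and column available for the half-pel taps */
static void example_avg_8x8_hpel( uint8_t *pui8_ref, int32_t i_ref_stride, uint8_t *pui8_pred, int32_t i_pred_stride )
{
    y262_motcomp_8x8_11_avg_neon( pui8_ref, i_ref_stride, pui8_pred, i_pred_stride );
}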
42 src/y262/pixelop_arm64.h (new file)
@@ -0,0 +1,42 @@
|
|||
/*
|
||||
Copyright (c) 2013, Ralf Willenbacher
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
int32_t y262_sad_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
|
||||
int32_t y262_sad_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
|
||||
|
||||
int32_t y262_satd_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
|
||||
int32_t y262_satd_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
|
||||
|
||||
int32_t y262_ssd_8x8_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride );
|
||||
int32_t y262_ssd_16x16_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride );
|
||||
|
||||
void y262_sub_8x8_neon( int16_t *pi16_diff, uint8_t *pui8_src1, int32_t i_stride_src1, uint8_t *pui8_src2, int32_t i_stride_src2 );
|
||||
void y262_add_8x8_neon( uint8_t *pui8_destination, int32_t i_destination_stride, uint8_t *pui8_base, int32_t i_base_stride, int16_t *pi16_difference );
|
||||
|
444 src/y262/transform_arm64.c (new file)
@@ -0,0 +1,444 @@
|
|||
/*
|
||||
Copyright (c) 2013, Ralf Willenbacher
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
//#ifdef ASSEMBLY_ARM64
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "y262.h"
|
||||
|
||||
|
||||
#define RND1BITS ( 11 )
|
||||
#define RND2BITS ( 31 - RND1BITS )
|
||||
|
||||
|
||||
static const int16_t rgi16_y262_fdct_neon_cs1[ 8 ][ 8 ] = {
|
||||
{ 16383, 16383, 16383, 16383, 16383, 16383, 16383, 16383 },
|
||||
{ 22724, 19265, 12872, 4520, -4520, -12872, -19265, -22724 },
|
||||
{ 21406, 8867, -8867, -21406, -21406, -8867, 8867, 21406 },
|
||||
{ 19265, -4520, -22724, -12872, 12872, 22724, 4520, -19265 },
|
||||
{ 16383, -16383, -16383, 16383, 16383, -16383, -16383, 16383 },
|
||||
{ 12872, -22724, 4520, 19265, -19265, -4520, 22724, -12872 },
|
||||
{ 8867, -21406, 21406, -8867, -8867, 21406, -21406, 8867 },
|
||||
{ 4520, -12872, 19265, -22724, 22724, -19265, 12872, -4520 },
|
||||
};
|
||||
static const int16_t rgi16_y262_fdct_neon_cs2[ 32 ] = {
|
||||
16385, 16385, 22726, 19266, -8867, -21408, -22726, -12873,
|
||||
16385, 16385, 12873, 4521, 21408, 8867, 19266, -4521,
|
||||
16385, -16385, 12873, -22726, 21408, -8867, 19266, -22726,
|
||||
-16385, 16385, 4521, 19266, 8867, -21408, 4521, -12873,
|
||||
};
|
||||
|
||||
|
||||
void y262_fdct_neon( int16_t *pi16_block, int16_t *pi16_dst )
|
||||
{
|
||||
int i_i, i_j, i_k;
|
||||
int16x8_t rgv16_tmp[ 8 ], rgv16_dsts[ 8 ];
|
||||
int16x8_t rgv16_e[ 4 ], rgv16_ee[ 2 ];
|
||||
int32x4x2_t rgv32_transA[ 4 ];
|
||||
int16x8x2_t rgv32_transB[ 4 ];
|
||||
int16x4_t v8_mt0, v8_mt1, v8_mt2, v8_mt3, v8_mt4, v8_mt5, v8_mt6, v8_mt7;
|
||||
|
||||
#define RND( x, y ) ( ( ( x ) + ( ( y ) ? ( 1 << ( y - 1 ) ) : 0 ) ) >> ( y ) )
|
||||
#define MUL( x, m ) ( ( x ) * ( m ) )
|
||||
|
||||
for( i_j = 1; i_j < 8; i_j += 2 )
|
||||
{
|
||||
int16x8_t v16_d;
|
||||
int32x4_t v16_s0, v16_s1;
|
||||
|
||||
v16_d = vsubq_s16( vld1q_s16( &pi16_block[ 8 * 0 ] ), vld1q_s16( &pi16_block[ 8 * 7 ] ) );
|
||||
v16_s0 = vmull_n_s16( vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
|
||||
v16_s1 = vmull_n_s16( vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
|
||||
|
||||
for( i_k = 1; i_k < 4; i_k++ )
|
||||
{
|
||||
v16_d = vsubq_s16( vld1q_s16( &pi16_block[ 8 * i_k ] ), vld1q_s16( &pi16_block[ 8 * ( 7 - i_k ) ] ) );
|
||||
v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ i_k ] );
|
||||
v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ i_k ] );
|
||||
}
|
||||
|
||||
rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) );
|
||||
}
|
||||
|
||||
for ( i_k = 0; i_k < 4; i_k++ )
|
||||
{
|
||||
rgv16_e[ i_k ] = vaddq_s16( vld1q_s16( &pi16_block[ 8 * i_k ] ), vld1q_s16( &pi16_block[ 8 * ( 7 - i_k ) ] ) );
|
||||
}
|
||||
|
||||
for( i_j = 2; i_j < 8; i_j += 4 )
|
||||
{
|
||||
int16x8_t v16_d;
|
||||
int32x4_t v16_s0, v16_s1;
|
||||
|
||||
v16_d = vsubq_s16( rgv16_e[ 0 ], rgv16_e[ 3 ] );
|
||||
v16_s0 = vmull_n_s16( vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
|
||||
v16_s1 = vmull_n_s16( vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
|
||||
|
||||
v16_d = vsubq_s16( rgv16_e[ 1 ], rgv16_e[ 2 ] );
|
||||
v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
|
||||
v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
|
||||
|
||||
rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) );
|
||||
}
|
||||
for ( i_k = 0; i_k < 2; i_k++ )
|
||||
{
|
||||
rgv16_ee[ i_k ] = vaddq_s16( rgv16_e[ i_k ], rgv16_e[ 3 - i_k ] );
|
||||
}
|
||||
for( i_j = 0; i_j < 8; i_j += 4 )
|
||||
{
|
||||
int16x8_t v16_d;
|
||||
int32x4_t v16_s0, v16_s1;
|
||||
|
||||
v16_s0 = vmull_n_s16( vget_low_s16( rgv16_ee[ 0 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
|
||||
v16_s1 = vmull_n_s16( vget_high_s16( rgv16_ee[ 0 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
|
||||
|
||||
v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( rgv16_ee[ 1 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
|
||||
v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( rgv16_ee[ 1 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
|
||||
|
||||
rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) );
|
||||
}
|
||||
|
||||
v8_mt0 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 0 ] );
|
||||
v8_mt1 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 4 ] );
|
||||
v8_mt2 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 8 ] );
|
||||
v8_mt3 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 12 ] );
|
||||
v8_mt4 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 16 ] );
|
||||
v8_mt5 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 20 ] );
|
||||
v8_mt6 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 24 ] );
|
||||
v8_mt7 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 28 ] );
|
||||
|
||||
for( i_j = 0; i_j < 8; i_j++ )
|
||||
{
|
||||
int16x4_t v8_l0, v8_l1, v8_o, v8_e, v8_m0, v8_m1, v8_m2, v8_m3;
|
||||
int16x4x2_t v16_trn0;
|
||||
int32x2x2_t v16_trn1;
|
||||
int32x4_t v16_s0, v16_s1, v16_s2, v16_s3;
|
||||
int16x8_t v16_row;
|
||||
|
||||
v8_l0 = vget_low_s16( rgv16_tmp[ i_j ] );
|
||||
v8_l1 = vget_high_s16( rgv16_tmp[ i_j ] );
|
||||
v8_l1 = vrev64_s16( v8_l1 );
|
||||
v8_o = vsub_s16( v8_l0, v8_l1 );
|
||||
v8_e = vadd_s16( v8_l0, v8_l1 );
|
||||
|
||||
v16_trn1 = vzip_s32( vreinterpret_s32_s16( v8_e ), vreinterpret_s32_s16( v8_o ) );
|
||||
v8_m0 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] );
|
||||
v8_m1 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] );
|
||||
|
||||
v16_s0 = vmull_s16( v8_m0, v8_mt0 );
|
||||
v16_s1 = vmull_s16( v8_m1, v8_mt1 );
|
||||
v16_s0 = vmlal_s16( v16_s0, v8_m1, v8_mt2 );
|
||||
v16_s1 = vmlal_s16( v16_s1, v8_m0, v8_mt3 );
|
||||
|
||||
v16_s2 = vmull_s16( v8_m0, v8_mt4 );
|
||||
v16_s3 = vmull_s16( v8_m1, v8_mt5 );
|
||||
v16_s2 = vmlal_s16( v16_s2, v8_m1, v8_mt6 );
|
||||
v16_s3 = vmlal_s16( v16_s3, v8_m0, v8_mt7 );
|
||||
|
||||
v16_s0 = vpaddq_s32( v16_s0, v16_s1 );
|
||||
v16_s1 = vpaddq_s32( v16_s2, v16_s3 );
|
||||
|
||||
v16_row = vcombine_s16( vmovn_s32( vrshrq_n_s32( v16_s0, RND2BITS ) ), vmovn_s32(vrshrq_n_s32( v16_s1, RND2BITS ) ) );
|
||||
vst1q_s16( pi16_dst + ( 8 * i_j ), v16_row );
|
||||
}
|
||||
}
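A note on the rounding used above: vqrshrn_n_s32( x, RND1BITS ) applies essentially the same round-half-up shift as the scalar RND macro defined in this function, plus saturation to the int16 range when narrowing. A scalar sketch of one lane (not part of the commit):

/* sketch: scalar equivalent of the per-lane rounding shift,
   without the int16 saturation the narrowing intrinsic also performs */
static int32_t example_rnd1bits( int32_t i_x )
{
    return ( i_x + ( 1 << ( RND1BITS - 1 ) ) ) >> RND1BITS;
}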
|
||||
|
||||
|
||||
#define RND1BITS ( 11 )
|
||||
#define RND2BITS ( 31 - RND1BITS )
|
||||
|
||||
static const int16_t rgi16_y262_idct_cs1[ 8 ][ 8 ] = {
    { 16383, 16383, 16383, 16383, 16383, 16383, 16383, 16383 },
    { 22724, 19265, 12872, 4520, -4520, -12872, -19265, -22724 },
    { 21406, 8867, -8867, -21406, -21406, -8867, 8867, 21406 },
    { 19265, -4520, -22724, -12872, 12872, 22724, 4520, -19265 },
    { 16383, -16383, -16383, 16383, 16383, -16383, -16383, 16383 },
    { 12872, -22724, 4520, 19265, -19265, -4520, 22724, -12872 },
    { 8867, -21406, 21406, -8867, -8867, 21406, -21406, 8867 },
    { 4520, -12872, 19265, -22724, 22724, -19265, 12872, -4520 },
};
static const int16_t rgi16_y262_idct_cs2[ 8 ][ 8 ] = {
    { 16385, 16385, 16385, 16385, 16385, 16385, 16385, 16385 },
    { 22726, 19266, 12873, 4521, -4521, -12873, -19266, -22726 },
    { 21408, 8867, -8867, -21408, -21408, -8867, 8867, 21408 },
    { 19266, -4521, -22726, -12873, 12873, 22726, 4521, -19266 },
    { 16385, -16385, -16385, 16385, 16385, -16385, -16385, 16385 },
    { 12873, -22726, 4521, 19266, -19266, -4521, 22726, -12873 },
    { 8867, -21408, 21408, -8867, -8867, 21408, -21408, 8867 },
    { 4521, -12873, 19266, -22726, 22726, -19266, 12873, -4521 },
};

static const int16_t rgi16_y262_idct_neon_cs2[ 32 ] = {
    16385, 21408, 16385, 8867, 16385, -8867, 16385, -21408,
    16385, 8867, -16385, -21408, -16385, 21408, 16385, -8867,
    22726, 19266, 19266, -4521, 12873, -22726, 4521, -12873,
    12873, 4521, -22726, -12873, 4521, 19266, 19266, -22726
};
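

/* Two-pass 8x8 inverse DCT. The first pass works on four columns at a time,
   combining even/odd row contributions with rgi16_y262_idct_cs1 and rounding
   the 32-bit sums back to 16 bits with RND1BITS. The second pass then rebuilds
   each output row from the transposed intermediates using the interleaved
   rgi16_y262_idct_neon_cs2 constants and a final RND2BITS rounding shift. */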
void y262_idct_neon( int16_t *pi16_block, int16_t *pi16_dst )
{
    int i_j, i_k;
    int16_t rgi16_tmp[ 64 ];
    int32_t rgi_e[ 4 ], rgi_o[ 4 ];
    int32_t rgi_ee[ 2 ], rgi_eo[ 2 ];
    int32x4_t rgv16_o[ 4 ];
    int32x4_t rgv16_eo[ 4 ];
    int32x4_t rgv16_ee[ 4 ];
    int32x4_t rgv16_e[ 4 ];
    int16x4_t rgv8_tmp[ 8 ][ 2 ];
    int16x4_t v8_mt0, v8_mt1, v8_mt2, v8_mt3, v8_mt4, v8_mt5, v8_mt6, v8_mt7;

#define RND( x, y ) ( ( ( x ) + ( ( y ) ? ( 1 << ( y - 1 ) ) : 0 ) ) >> ( y ) )
#define MUL( x, m ) ( ( x ) * ( m ) )

    for( i_j = 0; i_j < 2; i_j++ )
    {
        int16x4_t v8_b0, v8_b1, v8_b2, v8_b3;

        v8_b0 = vld1_s16( pi16_block + ( 8 * 1 ) + ( i_j * 4 ) );
        v8_b1 = vld1_s16( pi16_block + ( 8 * 3 ) + ( i_j * 4 ) );
        v8_b2 = vld1_s16( pi16_block + ( 8 * 5 ) + ( i_j * 4 ) );
        v8_b3 = vld1_s16( pi16_block + ( 8 * 7 ) + ( i_j * 4 ) );

        rgv16_o[ 0 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 0 ] );
        rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 0 ] );
        rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 0 ] );
        rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 0 ] );

        rgv16_o[ 1 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 1 ] );
        rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 1 ] );
        rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 1 ] );
        rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 1 ] );

        rgv16_o[ 2 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 2 ] );
        rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 2 ] );
        rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 2 ] );
        rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 2 ] );

        rgv16_o[ 3 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 3 ] );
        rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 3 ] );
        rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 3 ] );
        rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 3 ] );

        v8_b0 = vld1_s16( pi16_block + ( 8 * 2 ) + ( i_j * 4 ) );
        v8_b1 = vld1_s16( pi16_block + ( 8 * 6 ) + ( i_j * 4 ) );
        v8_b2 = vld1_s16( pi16_block + ( 8 * 0 ) + ( i_j * 4 ) );
        v8_b3 = vld1_s16( pi16_block + ( 8 * 4 ) + ( i_j * 4 ) );

        rgv16_eo[ 0 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 2 ][ 0 ] );
        rgv16_eo[ 0 ] = vmlal_n_s16( rgv16_eo[ 0 ], v8_b1, rgi16_y262_idct_cs1[ 6 ][ 0 ] );
        rgv16_eo[ 1 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 2 ][ 1 ] );
        rgv16_eo[ 1 ] = vmlal_n_s16( rgv16_eo[ 1 ], v8_b1, rgi16_y262_idct_cs1[ 6 ][ 1 ] );
        rgv16_ee[ 0 ] = vmull_n_s16( v8_b2, rgi16_y262_idct_cs1[ 0 ][ 0 ] );
        rgv16_ee[ 0 ] = vmlal_n_s16( rgv16_ee[ 0 ], v8_b3, rgi16_y262_idct_cs1[ 4 ][ 0 ] );
        rgv16_ee[ 1 ] = vmull_n_s16( v8_b2, rgi16_y262_idct_cs1[ 0 ][ 1 ] );
        rgv16_ee[ 1 ] = vmlal_n_s16( rgv16_ee[ 1 ], v8_b3, rgi16_y262_idct_cs1[ 4 ][ 1 ] );

        rgv16_e[ 0 ] = vaddq_s32( rgv16_ee[ 0 ], rgv16_eo[ 0 ] );
        rgv16_e[ 1 ] = vaddq_s32( rgv16_ee[ 1 ], rgv16_eo[ 1 ] );
        rgv16_e[ 2 ] = vsubq_s32( rgv16_ee[ 1 ], rgv16_eo[ 1 ] );
        rgv16_e[ 3 ] = vsubq_s32( rgv16_ee[ 0 ], rgv16_eo[ 0 ] );

        for( i_k = 0; i_k < 4; i_k++ )
        {
            int32x4_t v16_eoa, v16_eos;
            v16_eoa = vaddq_s32( rgv16_e[ i_k ], rgv16_o[ i_k ] );
            rgv8_tmp[ i_k ][ i_j ] = vqrshrn_n_s32( v16_eoa, RND1BITS );
            v16_eos = vsubq_s32( rgv16_e[ 3 - i_k ], rgv16_o[ 3 - i_k ] );
            rgv8_tmp[ i_k + 4 ][ i_j ] = vqrshrn_n_s32( v16_eos, RND1BITS );
        }
    }

    v8_mt0 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 0 ] );
    v8_mt1 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 4 ] );
    v8_mt2 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 8 ] );
    v8_mt3 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 12 ] );
    v8_mt4 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 16 ] );
    v8_mt5 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 20 ] );
    v8_mt6 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 24 ] );
    v8_mt7 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 28 ] );
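
    /* Second pass: for every output row, gather the two intermediate
       half-rows, transpose them into coefficient pairs, multiply against the
       interleaved rgi16_y262_idct_neon_cs2 vectors, and fold the partial sums
       with pairwise adds before the final RND2BITS rounding shift and store. */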
    for( i_j = 0; i_j < 8; i_j++ )
    {
        int16x4_t v8_l02, v8_l46, v8_l13, v8_l57, v8_m0, v8_m1, v8_m2, v8_m3, v8_m4, v8_m5, v8_m6, v8_m7;
        int16x4x2_t v16_trn0;
        int32x2x2_t v16_trn1;
        int32x4_t v16_s0, v16_s1, v16_s2, v16_s3, v16_s4, v16_s5, v16_s6, v16_s7, v16_e, v16_o;
        int16x8_t v16_row;

        v8_m0 = rgv8_tmp[ i_j ][ 0 ];
        v8_m1 = rgv8_tmp[ i_j ][ 1 ];

        v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v8_m0 ), vreinterpret_s32_s16( v8_m1 ) );
        v16_trn0 = vzip_s16( vreinterpret_s16_s32( v16_trn1.val[ 0 ] ), vreinterpret_s16_s32( v16_trn1.val[ 1 ] ) );

        v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v16_trn0.val[ 0 ] ), vreinterpret_s32_s16( v16_trn0.val[ 0 ] ) );
        v8_l02 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] );
        v8_l13 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] );

        v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v16_trn0.val[ 1 ] ), vreinterpret_s32_s16( v16_trn0.val[ 1 ] ) );
        v8_l46 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] );
        v8_l57 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] );

        v16_s0 = vmull_s16( v8_l02, v8_mt0 );
        v16_s0 = vmlal_s16( v16_s0, v8_l46, v8_mt2 );
        v16_s1 = vmull_s16( v8_l02, v8_mt1 );
        v16_s1 = vmlal_s16( v16_s1, v8_l46, v8_mt3 );
        v16_s2 = vmull_s16( v8_l13, v8_mt4 );
        v16_s2 = vmlal_s16( v16_s2, v8_l57, v8_mt6 );
        v16_s3 = vmull_s16( v8_l13, v8_mt5 );
        v16_s3 = vmlal_s16( v16_s3, v8_l57, v8_mt7 );

        v16_s0 = vpaddq_s32( v16_s0, v16_s1 );
        v16_s1 = vpaddq_s32( v16_s2, v16_s3 );

        v16_e = vaddq_s32( v16_s0, v16_s1 );
        v16_o = vsubq_s32( v16_s0, v16_s1 );
        v16_o = vcombine_s32( vrev64_s32( vget_high_s32( v16_o ) ), vrev64_s32( vget_low_s32( v16_o ) ) );

        v16_row = vcombine_s16( vmovn_s32( vrshrq_n_s32( v16_e, RND2BITS ) ), vmovn_s32( vrshrq_n_s32( v16_o, RND2BITS ) ) );
        vst1q_s16( pi16_dst + ( 8 * i_j ), v16_row );
    }
}
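

/* Forward quantiser for intra blocks: the sign is split off with a compare
   mask and the magnitude is processed instead, the per-coefficient bias is
   added, the value is multiplied by the pre-scaled quantiser table entry and
   the top 16 bits of the product are kept, the sign is restored, and the
   result is clamped to [-2048, 2047]. The intra DC coefficient is saved
   before the loop and written back unquantised afterwards; the return value
   is derived from an OR-accumulation of the quantised rows. */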
int32_t y262_quant8x8_intra_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat, uint16_t *pui16_bias )
{
    int32_t i_y, i_x, i_qm, i_nz, i_intra_dc;
    int16x8_t v16_zero, v16_nz, v16_2047, v16_m2048;

    v16_nz = v16_zero = vmovq_n_s16( 0 );
    v16_2047 = vmovq_n_s16( 2047 );
    v16_m2048 = vmovq_n_s16( -2048 );

    i_intra_dc = pi_coeffs[ 0 ];
    pi_coeffs[ 0 ] = 0;

    i_nz = 0;
    for( i_y = 0; i_y < 8; i_y++ )
    {
        int16x8_t v16_co;
        int16x8_t v16_mask;
        uint16x8_t v16_qm, v16_bias, v16_cou;
        uint32x4_t v16_res0, v16_res1;

        v16_co = vld1q_s16( pi_coeffs + ( i_y * 8 ) );
        v16_qm = vld1q_u16( pui16_qmat + ( i_y * 8 ) );
        v16_bias = vld1q_u16( pui16_bias + ( i_y * 8 ) );
        v16_mask = vreinterpretq_s16_u16( vcgtq_s16( v16_zero, v16_co ) );
        v16_co = veorq_s16( v16_co, v16_mask );
        v16_co = vsubq_s16( v16_co, v16_mask );

        v16_cou = vaddq_u16( vreinterpretq_u16_s16( v16_co ), v16_bias );
        v16_res0 = vmull_u16( vget_low_u16( v16_cou ), vget_low_u16( v16_qm ) );
        v16_res1 = vmull_u16( vget_high_u16( v16_cou ), vget_high_u16( v16_qm ) );

        v16_co = vreinterpretq_s16_u16( vcombine_u16( vshrn_n_u32( v16_res0, 16 ), vshrn_n_u32( v16_res1, 16 ) ) );

        v16_co = veorq_s16( v16_co, v16_mask );
        v16_co = vsubq_s16( v16_co, v16_mask );

        v16_co = vminq_s16( v16_co, v16_2047 );
        v16_co = vmaxq_s16( v16_co, v16_m2048 );

        vst1q_s16( pi_coeffs + ( i_y * 8 ), v16_co );

        v16_nz = vorrq_s16( v16_co, v16_nz );
    }

    v16_nz = vnegq_s16( vreinterpretq_s16_u16( vceqzq_s16( v16_nz ) ) );
    i_nz = vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 0 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 1 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 2 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 3 );

    pi_coeffs[ 0 ] = i_intra_dc;

    return i_nz;
}
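

/* Forward quantiser for inter blocks: the same magnitude/multiply/shift/clamp
   pipeline as the intra version, but without the bias term and without the
   intra DC save/restore. */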
int32_t y262_quant8x8_inter_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat )
{
    int32_t i_y, i_x, i_qm, i_nz;
    int16x8_t v16_zero, v16_nz, v16_2047, v16_m2048;

    v16_nz = v16_zero = vmovq_n_s16( 0 );
    v16_2047 = vmovq_n_s16( 2047 );
    v16_m2048 = vmovq_n_s16( -2048 );

    i_nz = 0;
    for( i_y = 0; i_y < 8; i_y++ )
    {
        int16x8_t v16_co;
        int16x8_t v16_mask;
        uint16x8_t v16_qm;
        uint32x4_t v16_res0, v16_res1;

        v16_co = vld1q_s16( pi_coeffs + ( i_y * 8 ) );
        v16_qm = vld1q_u16( pui16_qmat + ( i_y * 8 ) );
        v16_mask = vreinterpretq_s16_u16( vcgtq_s16( v16_zero, v16_co ) );
        v16_co = veorq_s16( v16_co, v16_mask );
        v16_co = vsubq_s16( v16_co, v16_mask );

        v16_res0 = vmull_u16( vreinterpret_u16_s16( vget_low_s16( v16_co ) ), vget_low_u16( v16_qm ) );
        v16_res1 = vmull_u16( vreinterpret_u16_s16( vget_high_s16( v16_co ) ), vget_high_u16( v16_qm ) );

        v16_co = vreinterpretq_s16_u16( vcombine_u16( vshrn_n_u32( v16_res0, 16 ), vshrn_n_u32( v16_res1, 16 ) ) );

        v16_co = veorq_s16( v16_co, v16_mask );
        v16_co = vsubq_s16( v16_co, v16_mask );

        v16_co = vminq_s16( v16_co, v16_2047 );
        v16_co = vmaxq_s16( v16_co, v16_m2048 );

        vst1q_s16( pi_coeffs + ( i_y * 8 ), v16_co );

        v16_nz = vorrq_s16( v16_co, v16_nz );
    }

    v16_nz = vnegq_s16( vreinterpretq_s16_u16( vceqzq_s16( v16_nz ) ) );
    i_nz = vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 0 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 1 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 2 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 3 );

    return i_nz;
}


//#endif

36 src/y262/transform_arm64.h Normal file

@@ -0,0 +1,36 @@
/*
Copyright (c) 2013, Ralf Willenbacher
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
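
/* NEON transform and quantiser entry points; y262_initialize() installs these
   in s_funcs when ASSEMBLY_ARM64 is defined. */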
void y262_fdct_neon( int16_t *pi16_block, int16_t *pi16_dst );
void y262_idct_neon( int16_t *pi16_block, int16_t *pi16_dst );

int32_t y262_quant8x8_intra_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat, uint16_t *pui16_bias );
int32_t y262_quant8x8_inter_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat );

@@ -59,11 +59,19 @@ POSSIBILITY OF SUCH DAMAGE.
#include "transform.h"
#include "pixelop.h"
#include "me.h"
#include "transform_x86.h"
#include "pixelop_x86.h"
#include "ratectrl.h"
#include "threads.h"

#ifdef ASSEMBLY_X86
#include "transform_x86.h"
#include "pixelop_x86.h"
#endif

#ifdef ASSEMBLY_ARM64
#include "transform_arm64.h"
#include "pixelop_arm64.h"
#endif

void y262_init_motion_compensation( y262_t *ps_y262 );
void y262_error( y262_t *ps_y262, int32_t i_error_code, int8_t* pi8_format, ... );
void y262_encode_picture( y262_t *ps_y262, y262_picture_t *ps_picture, int32_t i_picture_type, int32_t i_pon );

@@ -548,6 +548,7 @@ int32_t y262_initialize( void *p_y262, y262_configuration_t *ps_config )

    ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_c;
    ps_y262->s_funcs.f_idct_8x8 = y262_idct_c;
#ifdef ASSEMBLY_X86
    if( 1 )
    {
        ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x16 ] = y262_sad_16x16_sse2;

@@ -565,7 +566,29 @@ int32_t y262_initialize( void *p_y262, y262_configuration_t *ps_config )
        ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_sse2;
        ps_y262->s_funcs.f_idct_8x8 = y262_idct_sse2;
    }
#endif

#ifdef ASSEMBLY_ARM64
    if( 1 )
    {
        ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x16 ] = y262_sad_16x16_neon;
        ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x8 ] = y262_sad_16x8_neon;
        ps_y262->s_funcs.rgf_satd[ BLOCK_TYPE_16x16 ] = y262_satd_16x16_neon;
        ps_y262->s_funcs.rgf_satd[ BLOCK_TYPE_16x8 ] = y262_satd_16x8_neon;

        ps_y262->s_funcs.f_ssd_16x16 = y262_ssd_16x16_neon;
        ps_y262->s_funcs.f_ssd_8x8 = y262_ssd_8x8_neon;
        ps_y262->s_funcs.f_add_8x8 = y262_add_8x8_neon;
        ps_y262->s_funcs.f_sub_8x8 = y262_sub_8x8_neon;
        ps_y262->s_funcs.f_quant8x8_intra_fw = y262_quant8x8_intra_fw_mpeg2_neon;
        ps_y262->s_funcs.f_quant8x8_inter_fw = y262_quant8x8_inter_fw_mpeg2_neon;

        ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_neon;
        ps_y262->s_funcs.f_idct_8x8 = y262_idct_neon;
    }
#endif

    memset( ps_y262->rgi_y262_motion_bits_x, 0, sizeof( ps_y262->rgi_y262_motion_bits_x ) );
    memset( ps_y262->rgi_y262_motion_bits_y, 1, sizeof( ps_y262->rgi_y262_motion_bits_y ) );

@@ -1,12 +1,6 @@
cmake_minimum_required(VERSION 3.1)
project(y262app)

if(CMAKE_SIZEOF_VOID_P EQUAL 8)
  set(ARCH "_x64")
else()
  set(ARCH "_x86")
endif()

set( SRC_FILES
  ${CMAKE_CURRENT_SOURCE_DIR}/main.c
)

@@ -14,7 +8,7 @@ set( SRC_FILES
add_executable(y262app ${SRC_FILES})
target_link_libraries(y262app liby262)
set_target_properties(y262app PROPERTIES
  OUTPUT_NAME "y262$<$<CONFIG:Debug>:d>${ARCH}"
  OUTPUT_NAME "y262$<$<CONFIG:Debug>:d>"
  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
  POSITION_INDEPENDENT_CODE ON
)