Removed x86-specific handling from CMake and added arm64 intrinsic functions. CMake should now work on all platforms.

rwillenbacher 2021-04-25 11:46:13 +02:00
parent 068ffc674c
commit 96bbc6c07e
10 changed files with 1527 additions and 43 deletions


@@ -4,32 +4,45 @@ project(liby262)
find_package(Threads)
find_program(YASM_EXE NAMES yasm)
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(ARCH "_x64")
if(WIN32)
set(YASM_ARGS -f win32 -m amd64 -DARCH_X86_64 -DPIC)
elseif(APPLE)
set(YASM_ARGS -f macho64 -m amd64 -DARCH_X86_64 -DPIC --prefix=_)
else()
set(YASM_ARGS -f elf64 -m amd64 -DARCH_X86_64 -DPIC)
endif()
else()
set(ARCH "_x86")
if(WIN32)
set(YASM_ARGS -f win32 --prefix=_)
elseif(APPLE)
set(YASM_ARGS -f macho32 --prefix=_)
else()
set(YASM_ARGS -f elf32)
endif()
message( "architecture: ${CMAKE_SYSTEM_PROCESSOR}")
set(Y262_TARGET_ARCH "unknown")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(Y262_TARGET_ARCH "intelx86")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|x86.*|i386.*")
set(Y262_TARGET_ARCH "intelx86")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|ARM64.*|AARCH64.*)")
set(Y262_TARGET_ARCH "arm64")
endif()
add_custom_command(OUTPUT pixelop_x86.o COMMAND ${YASM_EXE}
ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_x86.asm)
add_custom_command(OUTPUT transform_x86.o COMMAND ${YASM_EXE}
ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/transform_x86.asm)
message( "target_arch: ${Y262_TARGET_ARCH}")
add_library(liby262 STATIC
if(Y262_TARGET_ARCH MATCHES "intelx86")
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
if(WIN32)
set(YASM_ARGS -f win32 -m amd64 -DARCH_X86_64 -DPIC)
elseif(APPLE)
set(YASM_ARGS -f macho64 -m amd64 -DARCH_X86_64 -DPIC --prefix=_)
else()
set(YASM_ARGS -f elf64 -m amd64 -DARCH_X86_64 -DPIC)
endif()
else()
if(WIN32)
set(YASM_ARGS -f win32 --prefix=_)
elseif(APPLE)
set(YASM_ARGS -f macho32 --prefix=_)
else()
set(YASM_ARGS -f elf32)
endif()
endif()
add_custom_command(OUTPUT pixelop_x86.o COMMAND ${YASM_EXE}
ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_x86.asm)
add_custom_command(OUTPUT transform_x86.o COMMAND ${YASM_EXE}
ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/transform_x86.asm)
endif()
set(liby262_sources_basic
${CMAKE_CURRENT_SOURCE_DIR}/aboveslicelevel.h
${CMAKE_CURRENT_SOURCE_DIR}/bitstream.h
${CMAKE_CURRENT_SOURCE_DIR}/lookahead.h
@@ -56,17 +69,36 @@ add_library(liby262 STATIC
${CMAKE_CURRENT_SOURCE_DIR}/transform.c
${CMAKE_CURRENT_SOURCE_DIR}/y262.c
${CMAKE_CURRENT_SOURCE_DIR}/y262api.c
${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o
${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o
)
set_target_properties(liby262 PROPERTIES
OUTPUT_NAME "liby262$<$<CONFIG:Debug>:d>${ARCH}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
set(liby262_sources_assembly "")
if(Y262_TARGET_ARCH MATCHES "intelx86")
set(liby262_sources_assembly
${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o
${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o
)
elseif(Y262_TARGET_ARCH MATCHES "arm64")
set(liby262_sources_assembly
${CMAKE_CURRENT_SOURCE_DIR}/transform_arm64.c
${CMAKE_CURRENT_SOURCE_DIR}/transform_arm64.h
${CMAKE_CURRENT_SOURCE_DIR}/pixelop_arm64.c
${CMAKE_CURRENT_SOURCE_DIR}/pixelop_arm64.h
)
endif()
add_library(liby262 STATIC
${liby262_sources_basic}
${liby262_sources_assembly}
)
target_include_directories(liby262 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
if(Y262_TARGET_ARCH MATCHES "intelx86")
add_compile_definitions(ASSEMBLY_X86)
elseif(Y262_TARGET_ARCH MATCHES "arm64")
add_compile_definitions(ASSEMBLY_ARM64)
endif()
if(WIN32)
target_compile_definitions(liby262 PRIVATE WIN32)
@@ -77,8 +109,12 @@ else()
target_link_libraries(liby262 PUBLIC m)
endif()
set_target_properties(liby262 PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(liby262 PROPERTIES
OUTPUT_NAME "liby262$<$<CONFIG:Debug>:d>${MY_ARCH}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
POSITION_INDEPENDENT_CODE ON
)
target_include_directories(liby262 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(liby262 PUBLIC Threads::Threads)
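The Y262_TARGET_ARCH switch above only decides which assembly/intrinsics sources are compiled and which ASSEMBLY_* macro gets defined; the C sources then select implementations through #ifdef blocks and the s_funcs function-pointer tables shown further down. A minimal sketch of that plumbing, assuming the unsuffixed name is the plain C fallback (only the _neon declaration is introduced by this commit; f_y262_sad_t and select_sad_16x16 are purely illustrative):

#include <stdint.h>

int32_t y262_sad_16x16( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );       /* assumed C fallback name */
#ifdef ASSEMBLY_ARM64
int32_t y262_sad_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );  /* from pixelop_arm64.c */
#endif

typedef int32_t ( *f_y262_sad_t )( uint8_t *, int32_t, uint8_t *, int32_t );

static f_y262_sad_t select_sad_16x16( void )
{
#ifdef ASSEMBLY_ARM64
    return y262_sad_16x16_neon;   /* only compiled in when Y262_TARGET_ARCH matched arm64 above */
#else
    return y262_sad_16x16;        /* plain C version otherwise */
#endif
}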


@@ -30,6 +30,8 @@ POSSIBILITY OF SUCH DAMAGE.
#include "y262.h"
#ifdef ASSEMBLY_X86
void y262_motcomp_16x16_00_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x16_01_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x16_10_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
@@ -80,6 +82,65 @@ void y262_motcomp_8x4_01_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, ui
void y262_motcomp_8x4_10_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x4_11_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
#endif
#ifdef ASSEMBLY_ARM64
void y262_motcomp_16x16_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x16_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x16_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x16_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x8_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x8_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x8_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x8_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x16_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x16_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x16_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x16_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x8_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x8_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x8_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x8_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x4_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x4_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x4_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x4_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x16_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x16_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x16_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x16_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x8_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x8_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x8_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_16x8_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x16_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x16_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x16_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x16_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x8_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x8_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x8_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x8_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x4_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x4_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x4_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
void y262_motcomp_8x4_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
#endif
@@ -322,7 +383,7 @@ void y262_init_motion_compensation( y262_t *ps_y262 )
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_avg;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg;
#if 1
#ifdef ASSEMBLY_X86
if( 1 )
{
@@ -382,6 +443,70 @@ void y262_init_motion_compensation( y262_t *ps_y262 )
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg_mmxext;
}
#endif
#ifdef ASSEMBLY_ARM64
if( 1 )
{
/* copy */
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_00 ] = y262_motcomp_16x16_00_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ] = y262_motcomp_16x16_01_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_10 ] = y262_motcomp_16x16_10_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_11 ] = y262_motcomp_16x16_11_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_00 ] = y262_motcomp_16x8_00_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_01 ] = y262_motcomp_16x8_01_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_10 ] = y262_motcomp_16x8_10_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_11 ] = y262_motcomp_16x8_11_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_00 ] = y262_motcomp_8x16_00_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_01 ] = y262_motcomp_8x16_01_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_10 ] = y262_motcomp_8x16_10_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_11 ] = y262_motcomp_8x16_11_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_00 ] = y262_motcomp_8x8_00_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_01 ] = y262_motcomp_8x8_01_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_10 ] = y262_motcomp_8x8_10_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_11 ] = y262_motcomp_8x8_11_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_00 ] = y262_motcomp_8x4_00_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_01 ] = y262_motcomp_8x4_01_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_put_neon;
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_put_neon;
/* avg */
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_00 ] = y262_motcomp_16x16_00_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ] = y262_motcomp_16x16_01_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_10 ] = y262_motcomp_16x16_10_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_11 ] = y262_motcomp_16x16_11_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_00 ] = y262_motcomp_16x8_00_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_01 ] = y262_motcomp_16x8_01_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_10 ] = y262_motcomp_16x8_10_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_11 ] = y262_motcomp_16x8_11_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_00 ] = y262_motcomp_8x16_00_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_01 ] = y262_motcomp_8x16_01_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_10 ] = y262_motcomp_8x16_10_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_11 ] = y262_motcomp_8x16_11_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_00 ] = y262_motcomp_8x8_00_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_01 ] = y262_motcomp_8x8_01_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_10 ] = y262_motcomp_8x8_10_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_11 ] = y262_motcomp_8x8_11_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_00 ] = y262_motcomp_8x4_00_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_01 ] = y262_motcomp_8x4_01_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_avg_neon;
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg_neon;
}
#endif
}
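Both tables initialized above are indexed by block size (MC_BLOCK_16x16 down to MC_BLOCK_8x4) and then by half-pel phase, where MC_BLOCK_00/01/10/11 mirror the _00/_01/_10/_11 suffixes of the registered functions (full-pel, horizontal half-pel, vertical half-pel, diagonal, matching the hpelidx parameter of the MC_FUNC_NEON macro in pixelop_arm64.c below). A hedged sketch of a call through the table; the actual call sites are untouched by this commit and mc_predict_16x16_diagonal is a made-up name:

#include "y262.h"   /* y262_t, s_funcs, MC_BLOCK_* */

static void mc_predict_16x16_diagonal( y262_t *ps_y262, uint8_t *pui8_src, int32_t i_src_stride,
                                       uint8_t *pui8_dst, int32_t i_dst_stride )
{
    /* resolves to y262_motcomp_16x16_11_put_neon on arm64, the sse2/mmxext
       version on x86, or the plain C implementation when no assembly backend
       is compiled in */
    ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_11 ]( pui8_src, i_src_stride, pui8_dst, i_dst_stride );
}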


@@ -226,6 +226,8 @@ int32_t y262_satd_16x8( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk
return i_satd;
}
#ifdef ASSEMBLY_X86
int32_t y262_satd_16x16_sse2( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
{
int32_t i_satd;
@@ -249,6 +251,7 @@ int32_t y262_satd_16x8_sse2( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui
return i_satd;
}
#endif
int32_t y262_ssd_16x16( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride )
{

src/y262/pixelop_arm64.c (new file, 773 lines)

@@ -0,0 +1,773 @@
/*
Copyright (c) 2013, Ralf Willenbacher
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include <arm_neon.h>
#include "y262.h"
int32_t y262_sad_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
{
int32_t i_sad, i_y;
int64_t i64_sad;
uint8x16_t v16_blk1, v16_blk2;
uint8x8_t v8_a, v8_b;
uint16x8_t v16_res0, v16_res1;
uint32x4_t v16_hadd0;
uint32x2_t v8_hadd1;
uint64x1_t v8_hadd2;
v16_blk1 = vld1q_u8 ( pui8_blk1 );
pui8_blk1 += i_stride1;
v16_blk2 = vld1q_u8 ( pui8_blk2 );
pui8_blk2 += i_stride2;
v8_a = vget_low_u8( v16_blk1 );
v8_b = vget_low_u8( v16_blk2 );
v16_res0 = vabdl_u8 ( v8_a, v8_b );
v8_a = vget_high_u8( v16_blk1 );
v8_b = vget_high_u8( v16_blk2 );
v16_res1 = vabdl_u8 ( v8_a, v8_b );
for( i_y = 1; i_y < 8; i_y++ )
{
v16_blk1 = vld1q_u8 ( pui8_blk1 );
pui8_blk1 += i_stride1;
v16_blk2 = vld1q_u8 ( pui8_blk2 );
pui8_blk2 += i_stride2;
v8_a = vget_low_u8( v16_blk1 );
v8_b = vget_low_u8( v16_blk2 );
v16_res0 = vabal_u8 ( v16_res0, v8_a, v8_b );
v8_a = vget_high_u8( v16_blk1 );
v8_b = vget_high_u8( v16_blk2 );
v16_res1 = vabal_u8 ( v16_res1, v8_a, v8_b );
}
v16_res0 = vaddq_u16( v16_res0, v16_res1 );
v16_hadd0 = vpaddlq_u16( v16_res0 );
v8_hadd1 = vadd_u32( vget_low_u32( v16_hadd0 ), vget_high_u32( v16_hadd0 ) );
v8_hadd2 = vpaddl_u32( v8_hadd1 );
i64_sad = vget_lane_u64( v8_hadd2, 0 );
i_sad = ( int32_t )i64_sad;
return i_sad;
}
int32_t y262_sad_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
{
int32_t i_sad, i_y;
int64_t i64_sad;
uint8x16_t v16_blk1, v16_blk2;
uint8x8_t v8_a, v8_b;
uint16x8_t v16_res0, v16_res1;
uint32x4_t v16_hadd0;
uint32x2_t v8_hadd1;
uint64x1_t v8_hadd2;
v16_blk1 = vld1q_u8 ( pui8_blk1 );
pui8_blk1 += i_stride1;
v16_blk2 = vld1q_u8 ( pui8_blk2 );
pui8_blk2 += i_stride2;
v8_a = vget_low_u8( v16_blk1 );
v8_b = vget_low_u8( v16_blk2 );
v16_res0 = vabdl_u8 ( v8_a, v8_b );
v8_a = vget_high_u8( v16_blk1 );
v8_b = vget_high_u8( v16_blk2 );
v16_res1 = vabdl_u8 ( v8_a, v8_b );
for( i_y = 1; i_y < 16; i_y++ )
{
v16_blk1 = vld1q_u8 ( pui8_blk1 );
pui8_blk1 += i_stride1;
v16_blk2 = vld1q_u8 ( pui8_blk2 );
pui8_blk2 += i_stride2;
v8_a = vget_low_u8( v16_blk1 );
v8_b = vget_low_u8( v16_blk2 );
v16_res0 = vabal_u8 ( v16_res0, v8_a, v8_b );
v8_a = vget_high_u8( v16_blk1 );
v8_b = vget_high_u8( v16_blk2 );
v16_res1 = vabal_u8 ( v16_res1, v8_a, v8_b );
}
v16_res0 = vaddq_u16( v16_res0, v16_res1 );
v16_hadd0 = vpaddlq_u16( v16_res0 );
v8_hadd1 = vadd_u32( vget_low_u32( v16_hadd0 ), vget_high_u32( v16_hadd0 ) );
v8_hadd2 = vpaddl_u32( v8_hadd1 );
i64_sad = vget_lane_u64( v8_hadd2, 0 );
i_sad = ( int32_t )i64_sad;
return i_sad;
}
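Both SAD routines above accumulate |blk1 - blk2| into two uint16x8 registers (vabdl_u8 for the first row, vabal_u8 afterwards), which cannot overflow because each lane sums at most 16 differences of at most 255, and only reduce to a scalar at the very end. A scalar sketch of the same computation (not the encoder's actual C fallback):

#include <stdint.h>
#include <stdlib.h>

static int32_t sad_16xN_ref( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2, int32_t i_height )
{
    int32_t i_x, i_y, i_sad = 0;

    for( i_y = 0; i_y < i_height; i_y++ )
    {
        for( i_x = 0; i_x < 16; i_x++ )
        {
            i_sad += abs( pui8_blk1[ i_x ] - pui8_blk2[ i_x ] );   /* vabdl_u8 / vabal_u8 do this 8 lanes at a time */
        }
        pui8_blk1 += i_stride1;
        pui8_blk2 += i_stride2;
    }
    return i_sad;   /* y262_sad_16x8_neon corresponds to i_height = 8, y262_sad_16x16_neon to i_height = 16 */
}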
#define HADAMARD_NEON_4x2( d0, d1, d2, d3 ) \
{ \
int32x4x2_t v32_a0, v32_b0; \
int16x8x2_t v32_a1, v32_b1; \
\
d0 = vaddq_s16( d0, d1 ); \
d2 = vaddq_s16( d2, d3 ); \
\
d1 = vaddq_s16( d1, d1 ); \
d3 = vaddq_s16( d3, d3 ); \
d1 = vsubq_s16( d1, d0 ); \
d3 = vsubq_s16( d3, d2 ); \
\
d0 = vaddq_s16( d0, d2 ); \
d1 = vaddq_s16( d1, d3 ); \
\
d2 = vaddq_s16( d2, d2 ); \
d3 = vaddq_s16( d3, d3 ); \
d2 = vsubq_s16( d2, d0 ); \
d3 = vsubq_s16( d3, d1 ); \
\
v32_a0 = vtrnq_s32( vreinterpretq_s32_s16( d0 ), vreinterpretq_s32_s16( d2 ) ); \
v32_b0 = vtrnq_s32( vreinterpretq_s32_s16( d1 ), vreinterpretq_s32_s16( d3 ) ); \
v32_a1 = vtrnq_s16( vreinterpretq_s16_s32( v32_a0.val[ 0 ] ), vreinterpretq_s16_s32( v32_b0.val[ 0 ] ) ); \
v32_b1 = vtrnq_s16( vreinterpretq_s16_s32( v32_a0.val[ 1 ] ), vreinterpretq_s16_s32( v32_b0.val[ 1 ] ) ); \
d0 = vcombine_s16( vget_low_s16( v32_a1.val[ 0 ] ), vget_high_s16( v32_a1.val[ 0 ] ) ); \
d1 = vcombine_s16( vget_low_s16( v32_a1.val[ 1 ] ), vget_high_s16( v32_a1.val[ 1 ] ) ); \
d2 = vcombine_s16( vget_low_s16( v32_b1.val[ 1 ] ), vget_high_s16( v32_b1.val[ 1 ] ) ); \
d3 = vcombine_s16( vget_low_s16( v32_b1.val[ 0 ] ), vget_high_s16( v32_b1.val[ 0 ] ) ); \
\
d0 = vaddq_s16( d0, d1 ); \
d2 = vaddq_s16( d2, d3 ); \
\
d1 = vaddq_s16( d1, d1 ); \
d3 = vaddq_s16( d3, d3 ); \
d1 = vsubq_s16( d1, d0 ); \
d3 = vsubq_s16( d3, d2 ); \
\
d0 = vaddq_s16( d0, d2 ); \
d1 = vaddq_s16( d1, d3 ); \
\
d2 = vaddq_s16( d2, d2 ); \
d3 = vaddq_s16( d3, d3 ); \
d2 = vsubq_s16( d2, d0 ); \
d3 = vsubq_s16( d3, d1 ); \
}
#define ADDSUB( d0, d1 ) \
{ \
int16x8_t v16_a, v16_s; \
v16_a = vaddq_s16( d0, d1 ); \
v16_s = vsubq_s16( d1, d0 ); \
d0 = v16_a; \
d1 = v16_s; \
}
#define ABSADDL( sum, vector0, vector1 ) \
{ \
vector0 = vabsq_s16( vector0 ); \
vector1 = vabsq_s16( vector1 ); \
sum = vaddq_s32( sum, vaddl_s16( vget_low_s16( vector0 ), vget_high_s16( vector0 ) ) ); \
sum = vaddq_s32( sum, vaddl_s16( vget_low_s16( vector1 ), vget_high_s16( vector1 ) ) ); \
}
int32_t y262_satd_8x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
{
int32_t i_satd;
int64_t i64_satd;
int16x8_t v16_d0, v16_d1, v16_d2, v16_d3, v16_d4, v16_d5, v16_d6, v16_d7;
int16x8_t v16_a0, v16_a1, v16_a2, v16_a3;
int32x4_t v16_res;
int32x2_t v8_hadd0;
int64x1_t v8_hadd1;
v16_res = vmovq_n_s32( 0 );
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 0 ), vld1_u8( pui8_blk2 + i_stride2 * 0 ) ) );
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 1 ), vld1_u8( pui8_blk2 + i_stride2 * 1 ) ) );
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 2 ), vld1_u8( pui8_blk2 + i_stride2 * 2 ) ) );
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 3 ), vld1_u8( pui8_blk2 + i_stride2 * 3 ) ) );
v16_d4 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 4 ), vld1_u8( pui8_blk2 + i_stride2 * 4 ) ) );
v16_d5 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 5 ), vld1_u8( pui8_blk2 + i_stride2 * 5 ) ) );
v16_d6 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 6 ), vld1_u8( pui8_blk2 + i_stride2 * 6 ) ) );
v16_d7 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 7 ), vld1_u8( pui8_blk2 + i_stride2 * 7 ) ) );
HADAMARD_NEON_4x2( v16_d0, v16_d1, v16_d2, v16_d3 );
HADAMARD_NEON_4x2( v16_d4, v16_d5, v16_d6, v16_d7 );
ADDSUB( v16_d0, v16_d4 );
ADDSUB( v16_d1, v16_d5 );
ADDSUB( v16_d2, v16_d6 );
ADDSUB( v16_d3, v16_d7 );
v16_a0 = vcombine_s16( vget_low_s16( v16_d0 ), vget_low_s16( v16_d4 ) );
v16_a1 = vcombine_s16( vget_high_s16( v16_d0 ), vget_high_s16( v16_d4 ) );
ADDSUB( v16_a0, v16_a1 );
ABSADDL( v16_res, v16_a0, v16_a1 );
v16_a0 = vcombine_s16( vget_low_s16( v16_d1 ), vget_low_s16( v16_d5 ) );
v16_a1 = vcombine_s16( vget_high_s16( v16_d1 ), vget_high_s16( v16_d5 ) );
ADDSUB( v16_a0, v16_a1 );
ABSADDL( v16_res, v16_a0, v16_a1 );
v16_a0 = vcombine_s16( vget_low_s16( v16_d2 ), vget_low_s16( v16_d6 ) );
v16_a1 = vcombine_s16( vget_high_s16( v16_d2 ), vget_high_s16( v16_d6 ) );
ADDSUB( v16_a0, v16_a1 );
ABSADDL( v16_res, v16_a0, v16_a1 );
v16_a0 = vcombine_s16( vget_low_s16( v16_d3 ), vget_low_s16( v16_d7 ) );
v16_a1 = vcombine_s16( vget_high_s16( v16_d3 ), vget_high_s16( v16_d7 ) );
ADDSUB( v16_a0, v16_a1 );
ABSADDL( v16_res, v16_a0, v16_a1 );
v8_hadd0 = vadd_s32( vget_low_s32( v16_res ), vget_high_s32( v16_res ) );
v8_hadd1 = vpaddl_s32( v8_hadd0 );
i64_satd = vget_lane_s64( v8_hadd1, 0 );
i_satd = ( int32_t )i64_satd;
i_satd = ( i_satd + 2 ) >> 2;
return i_satd;
}
int32_t y262_satd_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
{
int32_t i_satd;
i_satd = y262_satd_8x8_neon( pui8_blk1, i_stride1, pui8_blk2, i_stride2 );
i_satd += y262_satd_8x8_neon( pui8_blk1 + 8, i_stride1, pui8_blk2 + 8, i_stride2 );
i_satd += y262_satd_8x8_neon( pui8_blk1 + ( 8 * i_stride1 ), i_stride1, pui8_blk2 + ( 8 * i_stride2 ), i_stride2 );
i_satd += y262_satd_8x8_neon( pui8_blk1 + 8 + ( 8 * i_stride1 ), i_stride1, pui8_blk2 + 8 + ( 8 * i_stride2 ), i_stride2 );
return i_satd;
}
int32_t y262_satd_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
{
int32_t i_satd;
i_satd = y262_satd_8x8_neon( pui8_blk1, i_stride1, pui8_blk2, i_stride2 );
i_satd += y262_satd_8x8_neon( pui8_blk1 + 8, i_stride1, pui8_blk2 + 8, i_stride2 );
return i_satd;
}
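The SATD routines score a residual as the sum of absolute values of an 8x8 Hadamard transform of the pixel differences, normalized by ( sum + 2 ) >> 2; the 16x8 and 16x16 variants are sums of 8x8 calls. HADAMARD_NEON_4x2 performs 4-point butterflies in both directions on two four-row halves (with a vtrnq transpose in between), and the ADDSUB/vcombine steps in y262_satd_8x8_neon extend that to the full 8-point transform. For reference, the 4-point building block in scalar form (output order and signs differ from the NEON register layout, which is irrelevant once absolute values are summed):

#include <stdint.h>

/* one 4-point Walsh-Hadamard stage */
static void hadamard4_ref( int32_t rgi_d[ 4 ] )
{
    int32_t i_s01 = rgi_d[ 0 ] + rgi_d[ 1 ];
    int32_t i_d01 = rgi_d[ 0 ] - rgi_d[ 1 ];
    int32_t i_s23 = rgi_d[ 2 ] + rgi_d[ 3 ];
    int32_t i_d23 = rgi_d[ 2 ] - rgi_d[ 3 ];

    rgi_d[ 0 ] = i_s01 + i_s23;
    rgi_d[ 1 ] = i_d01 + i_d23;
    rgi_d[ 2 ] = i_s01 - i_s23;
    rgi_d[ 3 ] = i_d01 - i_d23;
}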
int32_t y262_ssd_8x8_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride )
{
int32_t i_ssd;
int16x8_t v16_d0, v16_d1, v16_d2, v16_d3;
int32x4_t v16_ssd0, v16_ssd1;
int32x2_t v8_hadd0;
int64x1_t v8_hadd1;
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 0 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 0 * i_blk2_stride ) ) ) );
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 1 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 1 * i_blk2_stride ) ) ) );
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 2 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 2 * i_blk2_stride ) ) ) );
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 3 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 3 * i_blk2_stride ) ) ) );
v16_ssd0 = vmull_s16( vget_low_s16( v16_d0 ), vget_low_s16( v16_d0 ) );
v16_ssd1 = vmull_s16( vget_high_s16( v16_d0 ), vget_high_s16( v16_d0 ) );
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d1 ), vget_low_s16( v16_d1 ) );
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d1 ), vget_high_s16( v16_d1 ) );
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d2 ), vget_low_s16( v16_d2 ) );
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d2 ), vget_high_s16( v16_d2 ) );
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d3 ), vget_low_s16( v16_d3 ) );
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d3 ), vget_high_s16( v16_d3 ) );
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 4 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 4 * i_blk2_stride ) ) ) );
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 5 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 5 * i_blk2_stride ) ) ) );
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 6 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 6 * i_blk2_stride ) ) ) );
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 7 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 7 * i_blk2_stride ) ) ) );
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d0 ), vget_low_s16( v16_d0 ) );
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d0 ), vget_high_s16( v16_d0 ) );
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d1 ), vget_low_s16( v16_d1 ) );
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d1 ), vget_high_s16( v16_d1 ) );
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d2 ), vget_low_s16( v16_d2 ) );
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d2 ), vget_high_s16( v16_d2 ) );
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d3 ), vget_low_s16( v16_d3 ) );
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d3 ), vget_high_s16( v16_d3 ) );
v16_ssd0 = vaddq_s32( v16_ssd0, v16_ssd1 );
v8_hadd0 = vadd_s32( vget_low_s32( v16_ssd0 ), vget_high_s32( v16_ssd0 ) );
v8_hadd0 = vpadd_s32( v8_hadd0, v8_hadd0 );
i_ssd = vget_lane_s32( v8_hadd0, 0 );
return i_ssd;
}
int32_t y262_ssd_16x16_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride )
{
int32_t i_ssd;
i_ssd = y262_ssd_8x8_neon( pui8_blk1, i_blk1_stride, pui8_blk2, i_blk2_stride );
i_ssd += y262_ssd_8x8_neon( pui8_blk1 + 8, i_blk1_stride, pui8_blk2 + 8, i_blk2_stride );
i_ssd += y262_ssd_8x8_neon( pui8_blk1 + ( 8 * i_blk1_stride ), i_blk1_stride, pui8_blk2 + ( 8 * i_blk2_stride ), i_blk2_stride );
i_ssd += y262_ssd_8x8_neon( pui8_blk1 + 8 + ( 8 * i_blk1_stride ), i_blk1_stride, pui8_blk2 + 8 + ( 8 * i_blk2_stride), i_blk2_stride );
return i_ssd;
}
void y262_sub_8x8_neon( int16_t *pi16_diff, uint8_t *pui8_src1, int32_t i_stride_src1, uint8_t *pui8_src2, int32_t i_stride_src2 )
{
int16x8_t v16_d0, v16_d1, v16_d2, v16_d3;
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 0 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 0 * i_stride_src2 ) ) ) );
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 1 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 1 * i_stride_src2 ) ) ) );
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 2 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 2 * i_stride_src2 ) ) ) );
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 3 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 3 * i_stride_src2 ) ) ) );
vst1q_s16( pi16_diff + 0, v16_d0 );
vst1q_s16( pi16_diff + 8, v16_d1 );
vst1q_s16( pi16_diff + 16, v16_d2 );
vst1q_s16( pi16_diff + 24, v16_d3 );
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 4 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 4 * i_stride_src2 ) ) ) );
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 5 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 5 * i_stride_src2 ) ) ) );
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 6 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 6 * i_stride_src2 ) ) ) );
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 7 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 7 * i_stride_src2 ) ) ) );
vst1q_s16( pi16_diff + 32, v16_d0 );
vst1q_s16( pi16_diff + 40, v16_d1 );
vst1q_s16( pi16_diff + 48, v16_d2 );
vst1q_s16( pi16_diff + 56, v16_d3 );
}
void y262_add_8x8_neon( uint8_t *pui8_destination, int32_t i_destination_stride, uint8_t *pui8_base, int32_t i_base_stride, int16_t *pi16_difference )
{
int32_t i_y;
int16x8_t v16_zero = vmovq_n_s16( 0 );
for( i_y = 0; i_y < 8; i_y += 2 )
{
int16x8_t v16_d0, v16_b0, v16_d1, v16_b1;
v16_d0 = vld1q_s16( pi16_difference + ( i_y * 8 ) );
v16_d1 = vld1q_s16( pi16_difference + ( ( i_y + 1 ) * 8 ) );
v16_b0 = vreinterpretq_s16_u16( vshll_n_u8( vld1_u8( pui8_base + ( i_y * i_base_stride ) ), 0 ) );
v16_b1 = vreinterpretq_s16_u16( vshll_n_u8( vld1_u8( pui8_base + ( ( i_y + 1 ) * i_base_stride ) ), 0 ) );
v16_d0 = vaddq_s16( v16_d0, v16_b0 );
v16_d1 = vaddq_s16( v16_d1, v16_b1 );
v16_d0 = vmaxq_s16( v16_zero, v16_d0 );
v16_d1 = vmaxq_s16( v16_zero, v16_d1 );
vst1_u8( pui8_destination + ( i_y * i_destination_stride ), vqmovn_u16( vreinterpretq_u16_s16( v16_d0 ) ) );
vst1_u8( pui8_destination + ( ( i_y + 1 ) * i_destination_stride ), vqmovn_u16( vreinterpretq_u16_s16( v16_d1 ) ) );
}
}
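y262_sub_8x8_neon stores the 16-bit residual block, and y262_add_8x8_neon reconstructs pixels as base plus difference clamped to [ 0, 255 ]: vmaxq_s16 against zero handles the low side and the saturating narrow vqmovn_u16 handles the high side. Per pixel that amounts to:

#include <stdint.h>

/* scalar model of the reconstruction clamp in y262_add_8x8_neon */
static uint8_t reconstruct_pixel( uint8_t ui8_base, int16_t i16_diff )
{
    int32_t i_v = ui8_base + i16_diff;

    if( i_v < 0 )
    {
        i_v = 0;      /* vmaxq_s16( v16_zero, ... ) */
    }
    else if( i_v > 255 )
    {
        i_v = 255;    /* saturation in vqmovn_u16 */
    }
    return ( uint8_t )i_v;
}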
/* MC */
#define MC_FUNC_NEON( name, i_width, i_height, hpelidx ) \
void y262_motcomp_##name##_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ) \
{ \
int32_t i_x, i_y; \
\
if( hpelidx == 0 ) \
{ \
for( i_y = 0; i_y < i_height; i_y++ ) \
{ \
if( i_width == 8 ) \
{ \
uint8x8_t v8_a; \
v8_a = vld1_u8( pui8_src ); \
vst1_u8( pui8_dst, v8_a ); \
} \
else if( i_width == 16 ) \
{ \
uint8x16_t v16_a; \
v16_a = vld1q_u8( pui8_src ); \
vst1q_u8( pui8_dst, v16_a ); \
} \
pui8_src += i_src_stride; \
pui8_dst += i_dst_stride; \
} \
} \
else if( hpelidx == 1 ) \
{ \
uint8_t *pui8_src1, *pui8_src2; \
\
pui8_src1 = pui8_src; \
pui8_src2 = pui8_src + 1; \
\
for( i_y = 0; i_y < i_height; i_y++ ) \
{ \
if( i_width == 8 ) \
{ \
uint8x8_t v8_a, v8_b; \
v8_a = vld1_u8( pui8_src1 ); \
v8_b = vld1_u8( pui8_src2 ); \
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_b ) ); \
} \
else if( i_width == 16 ) \
{ \
uint8x16_t v16_a, v16_b; \
v16_a = vld1q_u8( pui8_src1 ); \
v16_b = vld1q_u8( pui8_src2 ); \
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_b ) ); \
} \
pui8_src1 += i_src_stride; \
pui8_src2 += i_src_stride; \
pui8_dst += i_dst_stride; \
} \
} \
else if( hpelidx == 2 ) \
{ \
uint8_t *pui8_src1, *pui8_src2; \
\
pui8_src1 = pui8_src; \
pui8_src2 = pui8_src + i_src_stride; \
\
for( i_y = 0; i_y < i_height; i_y++ ) \
{ \
if( i_width == 8 ) \
{ \
uint8x8_t v8_a, v8_b; \
v8_a = vld1_u8( pui8_src1 ); \
v8_b = vld1_u8( pui8_src2 ); \
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_b ) ); \
} \
else if( i_width == 16 ) \
{ \
uint8x16_t v16_a, v16_b; \
v16_a = vld1q_u8( pui8_src1 ); \
v16_b = vld1q_u8( pui8_src2 ); \
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_b ) ); \
} \
pui8_src1 += i_src_stride; \
pui8_src2 += i_src_stride; \
pui8_dst += i_dst_stride; \
} \
} \
else \
{ \
uint8_t *pui8_src1, *pui8_src2, *pui8_src3, *pui8_src4; \
uint8x8_t v8_a, v8_b, v8_c, v8_d; \
uint8x16_t v16_a, v16_b, v16_c, v16_d; \
uint8x8_t v8_one = vmov_n_u8( 1 ); \
uint8x16_t v16_one = vmovq_n_u8( 1 ); \
\
pui8_src1 = pui8_src; \
pui8_src2 = pui8_src + 1; \
pui8_src3 = pui8_src + i_src_stride; \
pui8_src4 = pui8_src + i_src_stride + 1; \
\
if( i_width == 8 ) \
{ \
v8_a = vld1_u8( pui8_src1 ); \
v8_b = vld1_u8( pui8_src2 ); \
} \
else if( i_width == 16 ) \
{ \
v16_a = vld1q_u8( pui8_src1 ); \
v16_b = vld1q_u8( pui8_src2 ); \
} \
for( i_y = 0; i_y < i_height; i_y++ ) \
{ \
if( i_width == 8 ) \
{ \
uint8x8_t v8_carry0, v8_carry1; \
v8_c = vld1_u8( pui8_src3 ); \
v8_d = vld1_u8( pui8_src4 ); \
\
v8_carry0 = veor_u8( v8_a, v8_c); \
v8_carry1 = veor_u8( v8_b, v8_d); \
\
v8_a = vrhadd_u8( v8_a, v8_c ); \
v8_b = vrhadd_u8( v8_b, v8_d ); \
v8_carry0 = vorr_u8( v8_carry0, v8_carry1 ); \
\
v8_carry1 = veor_u8( v8_a, v8_b); \
v8_carry0 = vand_u8( v8_carry0, v8_carry1 ); \
v8_carry0 = vand_u8( v8_carry0, v8_one ); \
\
v8_a = vrhadd_u8( v8_a, v8_b ); \
v8_a = vsub_u8( v8_a, v8_carry0 ); \
\
vst1_u8( pui8_dst, v8_a ); \
\
v8_a = v8_c; \
v8_b = v8_d; \
} \
else if( i_width == 16 ) \
{ \
uint8x16_t v16_carry0, v16_carry1; \
v16_c = vld1q_u8( pui8_src3 ); \
v16_d = vld1q_u8( pui8_src4 ); \
\
v16_carry0 = veorq_u8( v16_a, v16_c); \
v16_carry1 = veorq_u8( v16_b, v16_d); \
\
v16_a = vrhaddq_u8( v16_a, v16_c ); \
v16_b = vrhaddq_u8( v16_b, v16_d ); \
v16_carry0 = vorrq_u8( v16_carry0, v16_carry1 ); \
\
v16_carry1 = veorq_u8( v16_a, v16_b); \
v16_carry0 = vandq_u8( v16_carry0, v16_carry1 ); \
v16_carry0 = vandq_u8( v16_carry0, v16_one ); \
\
v16_a = vrhaddq_u8( v16_a, v16_b ); \
v16_a = vsubq_u8( v16_a, v16_carry0 ); \
\
vst1q_u8( pui8_dst, v16_a ); \
\
v16_a = v16_c; \
v16_b = v16_d; \
} \
pui8_src1 += i_src_stride; \
pui8_src2 += i_src_stride; \
pui8_src3 += i_src_stride; \
pui8_src4 += i_src_stride; \
pui8_dst += i_dst_stride; \
} \
} \
} \
\
\
void y262_motcomp_##name##_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ) \
{ \
int32_t i_x, i_y; \
\
if( hpelidx == 0 ) \
{ \
for( i_y = 0; i_y < i_height; i_y++ ) \
{ \
if( i_width == 8 ) \
{ \
uint8x8_t v8_a, v8_z; \
v8_a = vld1_u8( pui8_src ); \
v8_z = vld1_u8( pui8_dst ); \
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
} \
else if( i_width == 16 ) \
{ \
uint8x16_t v16_a, v16_z; \
v16_a = vld1q_u8( pui8_src ); \
v16_z = vld1q_u8( pui8_dst ); \
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
} \
pui8_src += i_src_stride; \
pui8_dst += i_dst_stride; \
} \
} \
else if( hpelidx == 1 ) \
{ \
uint8_t *pui8_src1, *pui8_src2; \
\
pui8_src1 = pui8_src; \
pui8_src2 = pui8_src + 1; \
\
for( i_y = 0; i_y < i_height; i_y++ ) \
{ \
if( i_width == 8 ) \
{ \
uint8x8_t v8_a, v8_b, v8_z; \
v8_a = vld1_u8( pui8_src1 ); \
v8_b = vld1_u8( pui8_src2 ); \
v8_z = vld1_u8( pui8_dst ); \
v8_a = vrhadd_u8( v8_a, v8_b ); \
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
} \
else if( i_width == 16 ) \
{ \
uint8x16_t v16_a, v16_b, v16_z; \
v16_a = vld1q_u8( pui8_src1 ); \
v16_b = vld1q_u8( pui8_src2 ); \
v16_z = vld1q_u8( pui8_dst ); \
v16_a = vrhaddq_u8( v16_a, v16_b ); \
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
} \
pui8_src1 += i_src_stride; \
pui8_src2 += i_src_stride; \
pui8_dst += i_dst_stride; \
} \
} \
else if( hpelidx == 2 ) \
{ \
uint8_t *pui8_src1, *pui8_src2; \
\
pui8_src1 = pui8_src; \
pui8_src2 = pui8_src + i_src_stride; \
\
for( i_y = 0; i_y < i_height; i_y++ ) \
{ \
if( i_width == 8 ) \
{ \
uint8x8_t v8_a, v8_b, v8_z; \
v8_a = vld1_u8( pui8_src1 ); \
v8_b = vld1_u8( pui8_src2 ); \
v8_z = vld1_u8( pui8_dst ); \
v8_a = vrhadd_u8( v8_a, v8_b ); \
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
} \
else if( i_width == 16 ) \
{ \
uint8x16_t v16_a, v16_b, v16_z; \
v16_a = vld1q_u8( pui8_src1 ); \
v16_b = vld1q_u8( pui8_src2 ); \
v16_z = vld1q_u8( pui8_dst ); \
v16_a = vrhaddq_u8( v16_a, v16_b ); \
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
} \
pui8_src1 += i_src_stride; \
pui8_src2 += i_src_stride; \
pui8_dst += i_dst_stride; \
} \
} \
else \
{ \
uint8_t *pui8_src1, *pui8_src2, *pui8_src3, *pui8_src4; \
uint8x8_t v8_a, v8_b, v8_c, v8_d, v8_z; \
uint8x16_t v16_a, v16_b, v16_c, v16_d, v16_z; \
uint8x8_t v8_one = vmov_n_u8( 1 ); \
uint8x16_t v16_one = vmovq_n_u8( 1 ); \
\
pui8_src1 = pui8_src; \
pui8_src2 = pui8_src + 1; \
pui8_src3 = pui8_src + i_src_stride; \
pui8_src4 = pui8_src + i_src_stride + 1; \
\
if( i_width == 8 ) \
{ \
v8_a = vld1_u8( pui8_src1 ); \
v8_b = vld1_u8( pui8_src2 ); \
} \
else if( i_width == 16 ) \
{ \
v16_a = vld1q_u8( pui8_src1 ); \
v16_b = vld1q_u8( pui8_src2 ); \
} \
for( i_y = 0; i_y < i_height; i_y++ ) \
{ \
if( i_width == 8 ) \
{ \
uint8x8_t v8_carry0, v8_carry1; \
v8_c = vld1_u8( pui8_src3 ); \
v8_d = vld1_u8( pui8_src4 ); \
v8_z = vld1_u8( pui8_dst ); \
\
v8_carry0 = veor_u8( v8_a, v8_c); \
v8_carry1 = veor_u8( v8_b, v8_d); \
\
v8_a = vrhadd_u8( v8_a, v8_c ); \
v8_b = vrhadd_u8( v8_b, v8_d ); \
v8_carry0 = vorr_u8( v8_carry0, v8_carry1 ); \
\
v8_carry1 = veor_u8( v8_a, v8_b); \
v8_carry0 = vand_u8( v8_carry0, v8_carry1 ); \
v8_carry0 = vand_u8( v8_carry0, v8_one ); \
\
v8_a = vrhadd_u8( v8_a, v8_b ); \
v8_a = vsub_u8( v8_a, v8_carry0 ); \
\
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
\
v8_a = v8_c; \
v8_b = v8_d; \
} \
else if( i_width == 16 ) \
{ \
uint8x16_t v16_carry0, v16_carry1; \
v16_c = vld1q_u8( pui8_src3 ); \
v16_d = vld1q_u8( pui8_src4 ); \
v16_z = vld1q_u8( pui8_dst ); \
\
v16_carry0 = veorq_u8( v16_a, v16_c); \
v16_carry1 = veorq_u8( v16_b, v16_d); \
\
v16_a = vrhaddq_u8( v16_a, v16_c ); \
v16_b = vrhaddq_u8( v16_b, v16_d ); \
v16_carry0 = vorrq_u8( v16_carry0, v16_carry1 ); \
\
v16_carry1 = veorq_u8( v16_a, v16_b); \
v16_carry0 = vandq_u8( v16_carry0, v16_carry1 ); \
v16_carry0 = vandq_u8( v16_carry0, v16_one ); \
\
v16_a = vrhaddq_u8( v16_a, v16_b ); \
v16_a = vsubq_u8( v16_a, v16_carry0 ); \
\
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
\
v16_a = v16_c; \
v16_b = v16_d; \
} \
pui8_src1 += i_src_stride; \
pui8_src2 += i_src_stride; \
pui8_src3 += i_src_stride; \
pui8_src4 += i_src_stride; \
pui8_dst += i_dst_stride; \
} \
} \
} \
MC_FUNC_NEON( 16x16_00, 16, 16, 0 );
MC_FUNC_NEON( 16x16_01, 16, 16, 1 );
MC_FUNC_NEON( 16x16_10, 16, 16, 2 );
MC_FUNC_NEON( 16x16_11, 16, 16, 3 );
MC_FUNC_NEON( 16x8_00, 16, 8, 0 );
MC_FUNC_NEON( 16x8_01, 16, 8, 1 );
MC_FUNC_NEON( 16x8_10, 16, 8, 2 );
MC_FUNC_NEON( 16x8_11, 16, 8, 3 );
MC_FUNC_NEON( 8x16_00, 8, 16, 0 );
MC_FUNC_NEON( 8x16_01, 8, 16, 1 );
MC_FUNC_NEON( 8x16_10, 8, 16, 2 );
MC_FUNC_NEON( 8x16_11, 8, 16, 3 );
MC_FUNC_NEON( 8x8_00, 8, 8, 0 );
MC_FUNC_NEON( 8x8_01, 8, 8, 1 );
MC_FUNC_NEON( 8x8_10, 8, 8, 2 );
MC_FUNC_NEON( 8x8_11, 8, 8, 3 );
MC_FUNC_NEON( 8x4_00, 8, 4, 0 );
MC_FUNC_NEON( 8x4_01, 8, 4, 1 );
MC_FUNC_NEON( 8x4_10, 8, 4, 2 );
MC_FUNC_NEON( 8x4_11, 8, 4, 3 );
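In the _11 (diagonal half-pel) paths of MC_FUNC_NEON above, the required output per pixel is ( a + b + c + d + 2 ) >> 2 over the four surrounding full-pel samples. Chaining two vrhadd_u8 rounding averages computes ( ( ( a + c + 1 ) >> 1 ) + ( ( b + d + 1 ) >> 1 ) + 1 ) >> 1 instead, which can exceed the exact value by one; the veor/vorr/vand sequence derives exactly that correction bit and subtracts it, so the result stays bit-exact without widening to 16 bits. A scalar model of the trick:

#include <stdint.h>

static uint8_t avg4_rounded_ref( uint8_t ui8_a, uint8_t ui8_b, uint8_t ui8_c, uint8_t ui8_d )
{
    uint8_t ui8_ac, ui8_bd, ui8_carry;

    ui8_ac = ( uint8_t )( ( ui8_a + ui8_c + 1 ) >> 1 );                  /* vrhadd_u8( a, c ) */
    ui8_bd = ( uint8_t )( ( ui8_b + ui8_d + 1 ) >> 1 );                  /* vrhadd_u8( b, d ) */
    /* the chained average overshoots exactly when one of the first averages rounded
       up ( low bit of a^c or b^d set ) and the final average rounds up as well
       ( low bit of ac^bd set ); this is the veor/vorr/vand bit in the NEON code */
    ui8_carry = ( uint8_t )( ( ( ui8_a ^ ui8_c ) | ( ui8_b ^ ui8_d ) ) & ( ui8_ac ^ ui8_bd ) & 1 );
    return ( uint8_t )( ( ( ui8_ac + ui8_bd + 1 ) >> 1 ) - ui8_carry );  /* == ( a + b + c + d + 2 ) >> 2 */
}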

src/y262/pixelop_arm64.h (new file, 42 lines)

@@ -0,0 +1,42 @@
/*
Copyright (c) 2013, Ralf Willenbacher
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
int32_t y262_sad_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
int32_t y262_sad_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
int32_t y262_satd_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
int32_t y262_satd_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
int32_t y262_ssd_8x8_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride );
int32_t y262_ssd_16x16_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride );
void y262_sub_8x8_neon( int16_t *pi16_diff, uint8_t *pui8_src1, int32_t i_stride_src1, uint8_t *pui8_src2, int32_t i_stride_src2 );
void y262_add_8x8_neon( uint8_t *pui8_destination, int32_t i_destination_stride, uint8_t *pui8_base, int32_t i_base_stride, int16_t *pi16_difference );

src/y262/transform_arm64.c (new file, 444 lines)

@@ -0,0 +1,444 @@
/*
Copyright (c) 2013, Ralf Willenbacher
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
//#ifdef ASSEMBLY_ARM64
#include <arm_neon.h>
#include "y262.h"
#define RND1BITS ( 11 )
#define RND2BITS ( 31 - RND1BITS )
static const int16_t rgi16_y262_fdct_neon_cs1[ 8 ][ 8 ] = {
{ 16383, 16383, 16383, 16383, 16383, 16383, 16383, 16383 },
{ 22724, 19265, 12872, 4520, -4520, -12872, -19265, -22724 },
{ 21406, 8867, -8867, -21406, -21406, -8867, 8867, 21406 },
{ 19265, -4520, -22724, -12872, 12872, 22724, 4520, -19265 },
{ 16383, -16383, -16383, 16383, 16383, -16383, -16383, 16383 },
{ 12872, -22724, 4520, 19265, -19265, -4520, 22724, -12872 },
{ 8867, -21406, 21406, -8867, -8867, 21406, -21406, 8867 },
{ 4520, -12872, 19265, -22724, 22724, -19265, 12872, -4520 },
};
static const int16_t rgi16_y262_fdct_neon_cs2[ 32 ] = {
16385, 16385, 22726, 19266, -8867, -21408, -22726, -12873,
16385, 16385, 12873, 4521, 21408, 8867, 19266, -4521,
16385, -16385, 12873, -22726, 21408, -8867, 19266, -22726,
-16385, 16385, 4521, 19266, 8867, -21408, 4521, -12873,
};
void y262_fdct_neon( int16_t *pi16_block, int16_t *pi16_dst )
{
int i_i, i_j, i_k;
int16x8_t rgv16_tmp[ 8 ], rgv16_dsts[ 8 ];
int16x8_t rgv16_e[ 4 ], rgv16_ee[ 2 ];
int32x4x2_t rgv32_transA[ 4 ];
int16x8x2_t rgv32_transB[ 4 ];
int16x4_t v8_mt0, v8_mt1, v8_mt2, v8_mt3, v8_mt4, v8_mt5, v8_mt6, v8_mt7;
#define RND( x, y ) ( ( ( x ) + ( ( y ) ? ( 1 << ( y - 1 ) ) : 0 ) ) >> ( y ) )
#define MUL( x, m ) ( ( x ) * ( m ) )
for( i_j = 1; i_j < 8; i_j += 2 )
{
int16x8_t v16_d;
int32x4_t v16_s0, v16_s1;
v16_d = vsubq_s16( vld1q_s16( &pi16_block[ 8 * 0 ] ), vld1q_s16( &pi16_block[ 8 * 7 ] ) );
v16_s0 = vmull_n_s16( vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
v16_s1 = vmull_n_s16( vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
for( i_k = 1; i_k < 4; i_k++ )
{
v16_d = vsubq_s16( vld1q_s16( &pi16_block[ 8 * i_k ] ), vld1q_s16( &pi16_block[ 8 * ( 7 - i_k ) ] ) );
v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ i_k ] );
v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ i_k ] );
}
rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) );
}
for ( i_k = 0; i_k < 4; i_k++ )
{
rgv16_e[ i_k ] = vaddq_s16( vld1q_s16( &pi16_block[ 8 * i_k ] ), vld1q_s16( &pi16_block[ 8 * ( 7 - i_k ) ] ) );
}
for( i_j = 2; i_j < 8; i_j += 4 )
{
int16x8_t v16_d;
int32x4_t v16_s0, v16_s1;
v16_d = vsubq_s16( rgv16_e[ 0 ], rgv16_e[ 3 ] );
v16_s0 = vmull_n_s16( vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
v16_s1 = vmull_n_s16( vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
v16_d = vsubq_s16( rgv16_e[ 1 ], rgv16_e[ 2 ] );
v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) );
}
for ( i_k = 0; i_k < 2; i_k++ )
{
rgv16_ee[ i_k ] = vaddq_s16( rgv16_e[ i_k ], rgv16_e[ 3 - i_k ] );
}
for( i_j = 0; i_j < 8; i_j += 4 )
{
int16x8_t v16_d;
int32x4_t v16_s0, v16_s1;
v16_s0 = vmull_n_s16( vget_low_s16( rgv16_ee[ 0 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
v16_s1 = vmull_n_s16( vget_high_s16( rgv16_ee[ 0 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( rgv16_ee[ 1 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( rgv16_ee[ 1 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) );
}
v8_mt0 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 0 ] );
v8_mt1 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 4 ] );
v8_mt2 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 8 ] );
v8_mt3 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 12 ] );
v8_mt4 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 16 ] );
v8_mt5 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 20 ] );
v8_mt6 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 24 ] );
v8_mt7 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 28 ] );
for( i_j = 0; i_j < 8; i_j++ )
{
int16x4_t v8_l0, v8_l1, v8_o, v8_e, v8_m0, v8_m1, v8_m2, v8_m3;
int16x4x2_t v16_trn0;
int32x2x2_t v16_trn1;
int32x4_t v16_s0, v16_s1, v16_s2, v16_s3;
int16x8_t v16_row;
v8_l0 = vget_low_s16( rgv16_tmp[ i_j ] );
v8_l1 = vget_high_s16( rgv16_tmp[ i_j ] );
v8_l1 = vrev64_s16( v8_l1 );
v8_o = vsub_s16( v8_l0, v8_l1 );
v8_e = vadd_s16( v8_l0, v8_l1 );
v16_trn1 = vzip_s32( vreinterpret_s32_s16( v8_e ), vreinterpret_s32_s16( v8_o ) );
v8_m0 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] );
v8_m1 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] );
v16_s0 = vmull_s16( v8_m0, v8_mt0 );
v16_s1 = vmull_s16( v8_m1, v8_mt1 );
v16_s0 = vmlal_s16( v16_s0, v8_m1, v8_mt2 );
v16_s1 = vmlal_s16( v16_s1, v8_m0, v8_mt3 );
v16_s2 = vmull_s16( v8_m0, v8_mt4 );
v16_s3 = vmull_s16( v8_m1, v8_mt5 );
v16_s2 = vmlal_s16( v16_s2, v8_m1, v8_mt6 );
v16_s3 = vmlal_s16( v16_s3, v8_m0, v8_mt7 );
v16_s0 = vpaddq_s32( v16_s0, v16_s1 );
v16_s1 = vpaddq_s32( v16_s2, v16_s3 );
v16_row = vcombine_s16( vmovn_s32( vrshrq_n_s32( v16_s0, RND2BITS ) ), vmovn_s32(vrshrq_n_s32( v16_s1, RND2BITS ) ) );
vst1q_s16( pi16_dst + ( 8 * i_j ), v16_row );
}
}
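y262_fdct_neon is a separable fixed-point 8x8 DCT: the first half of the function applies the 8-point transform down the columns (using the symmetry of each rgi16_y262_fdct_neon_cs1 row to work on sums and differences of mirrored input rows), and the final loop applies it along the rows with the coefficients rearranged in rgi16_y262_fdct_neon_cs2 to suit the pairwise vpaddq_s32 reduction; vqrshrn_n_s32 and vrshrq_n_s32 with RND1BITS and RND2BITS split the scaling between the two passes. The tables hold an 8-point DCT-II basis in roughly 14-bit fixed point (the non-DC rows match round( 2^14 * sqrt( 2 ) * cos( ( 2 * n + 1 ) * k * pi / 16 ) ) to within a unit or two). An unfolded scalar version of one column pass, equivalent to the folded NEON sums up to the rounding step:

#include <stdint.h>

/* call with rgi16_y262_fdct_neon_cs1 as the coefficient table */
static void fdct_1d_ref( const int16_t *pi16_in, int32_t *pi_out, const int16_t rgi16_cs[ 8 ][ 8 ] )
{
    int32_t i_k, i_n;

    for( i_k = 0; i_k < 8; i_k++ )
    {
        int32_t i_s = 0;
        for( i_n = 0; i_n < 8; i_n++ )
        {
            i_s += rgi16_cs[ i_k ][ i_n ] * pi16_in[ i_n ];
        }
        pi_out[ i_k ] = i_s;   /* the NEON code then narrows with vqrshrn_n_s32( ..., RND1BITS ) */
    }
}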
#define RND1BITS ( 11 )
#define RND2BITS ( 31 - RND1BITS )
static const int16_t rgi16_y262_idct_cs1[ 8 ][ 8 ] = {
{ 16383, 16383, 16383, 16383, 16383, 16383, 16383, 16383 },
{ 22724, 19265, 12872, 4520, -4520, -12872, -19265, -22724 },
{ 21406, 8867, -8867, -21406, -21406, -8867, 8867, 21406 },
{ 19265, -4520, -22724, -12872, 12872, 22724, 4520, -19265 },
{ 16383, -16383, -16383, 16383, 16383, -16383, -16383, 16383 },
{ 12872, -22724, 4520, 19265, -19265, -4520, 22724, -12872 },
{ 8867, -21406, 21406, -8867, -8867, 21406, -21406, 8867 },
{ 4520, -12872, 19265, -22724, 22724, -19265, 12872, -4520 },
};
static const int16_t rgi16_y262_idct_cs2[ 8 ][ 8 ] = {
{ 16385, 16385, 16385, 16385, 16385, 16385, 16385, 16385 },
{ 22726, 19266, 12873, 4521, -4521, -12873, -19266, -22726 },
{ 21408, 8867, -8867, -21408, -21408, -8867, 8867, 21408 },
{ 19266, -4521, -22726, -12873, 12873, 22726, 4521, -19266 },
{ 16385, -16385, -16385, 16385, 16385, -16385, -16385, 16385 },
{ 12873, -22726, 4521, 19266, -19266, -4521, 22726, -12873 },
{ 8867, -21408, 21408, -8867, -8867, 21408, -21408, 8867 },
{ 4521, -12873, 19266, -22726, 22726, -19266, 12873, -4521 },
};
static const int16_t rgi16_y262_idct_neon_cs2[ 32 ] = {
16385, 21408, 16385, 8867, 16385, -8867, 16385, -21408,
16385, 8867, -16385, -21408, -16385, 21408, 16385, -8867,
22726, 19266, 19266, -4521, 12873, -22726, 4521, -12873,
12873, 4521, -22726, -12873, 4521, 19266, 19266, -22726
};
void y262_idct_neon( int16_t *pi16_block, int16_t *pi16_dst )
{
int i_j, i_k;
int16_t rgi16_tmp[ 64 ];
int32_t rgi_e[ 4 ], rgi_o[ 4 ];
int32_t rgi_ee[ 2 ], rgi_eo[ 2 ];
int32x4_t rgv16_o[ 4 ];
int32x4_t rgv16_eo[ 4 ];
int32x4_t rgv16_ee[ 4 ];
int32x4_t rgv16_e[ 4 ];
int16x4_t rgv8_tmp[ 8 ][ 2 ];
int16x4_t v8_mt0, v8_mt1, v8_mt2, v8_mt3, v8_mt4, v8_mt5, v8_mt6, v8_mt7;
#define RND( x, y ) ( ( ( x ) + ( ( y ) ? ( 1 << ( y - 1 ) ) : 0 ) ) >> ( y ) )
#define MUL( x, m ) ( ( x ) * ( m ) )
for( i_j = 0; i_j < 2; i_j++ )
{
int16x4_t v8_b0, v8_b1, v8_b2, v8_b3;
v8_b0 = vld1_s16( pi16_block + ( 8 * 1 ) + ( i_j * 4 ) );
v8_b1 = vld1_s16( pi16_block + ( 8 * 3 ) + ( i_j * 4 ) );
v8_b2 = vld1_s16( pi16_block + ( 8 * 5 ) + ( i_j * 4 ) );
v8_b3 = vld1_s16( pi16_block + ( 8 * 7 ) + ( i_j * 4 ) );
rgv16_o[ 0 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 0 ] );
rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 0 ] );
rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 0 ] );
rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 0 ] );
rgv16_o[ 1 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 1 ] );
rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 1 ] );
rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 1 ] );
rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 1 ] );
rgv16_o[ 2 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 2 ] );
rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 2 ] );
rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 2 ] );
rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 2 ] );
rgv16_o[ 3 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 3 ] );
rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 3 ] );
rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 3 ] );
rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 3 ] );
v8_b0 = vld1_s16( pi16_block + ( 8 * 2 ) + ( i_j * 4 ) );
v8_b1 = vld1_s16( pi16_block + ( 8 * 6 ) + ( i_j * 4 ) );
v8_b2 = vld1_s16( pi16_block + ( 8 * 0 ) + ( i_j * 4 ) );
v8_b3 = vld1_s16( pi16_block + ( 8 * 4 ) + ( i_j * 4 ) );
rgv16_eo[ 0 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 2 ][ 0 ] );
rgv16_eo[ 0 ] = vmlal_n_s16( rgv16_eo[ 0 ], v8_b1, rgi16_y262_idct_cs1[ 6 ][ 0 ] );
rgv16_eo[ 1 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 2 ][ 1 ] );
rgv16_eo[ 1 ] = vmlal_n_s16( rgv16_eo[ 1 ], v8_b1, rgi16_y262_idct_cs1[ 6 ][ 1 ] );
rgv16_ee[ 0 ] = vmull_n_s16( v8_b2, rgi16_y262_idct_cs1[ 0 ][ 0 ] );
rgv16_ee[ 0 ] = vmlal_n_s16( rgv16_ee[ 0 ], v8_b3, rgi16_y262_idct_cs1[ 4 ][ 0 ] );
rgv16_ee[ 1 ] = vmull_n_s16( v8_b2, rgi16_y262_idct_cs1[ 0 ][ 1 ] );
rgv16_ee[ 1 ] = vmlal_n_s16( rgv16_ee[ 1 ], v8_b3, rgi16_y262_idct_cs1[ 4 ][ 1 ] );
rgv16_e[ 0 ] = vaddq_s32( rgv16_ee[ 0 ], rgv16_eo[ 0 ] );
rgv16_e[ 1 ] = vaddq_s32( rgv16_ee[ 1 ], rgv16_eo[ 1 ] );
rgv16_e[ 2 ] = vsubq_s32( rgv16_ee[ 1 ], rgv16_eo[ 1 ] );
rgv16_e[ 3 ] = vsubq_s32( rgv16_ee[ 0 ], rgv16_eo[ 0 ] );
for( i_k = 0; i_k < 4; i_k++ )
{
int32x4_t v16_eoa, v16_eos;
v16_eoa = vaddq_s32( rgv16_e[ i_k ], rgv16_o[ i_k ]);
rgv8_tmp[ i_k ][ i_j ] = vqrshrn_n_s32( v16_eoa, RND1BITS );
v16_eos = vsubq_s32( rgv16_e[ 3 - i_k ], rgv16_o[ 3 - i_k ]);
rgv8_tmp[ i_k + 4 ][ i_j ] = vqrshrn_n_s32( v16_eos, RND1BITS );
}
}
v8_mt0 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 0 ] );
v8_mt1 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 4 ] );
v8_mt2 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 8 ] );
v8_mt3 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 12 ] );
v8_mt4 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 16 ] );
v8_mt5 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 20 ] );
v8_mt6 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 24 ] );
v8_mt7 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 28 ] );
for( i_j = 0; i_j < 8; i_j++ )
{
int16x4_t v8_l02, v8_l46, v8_l13, v8_l57, v8_m0, v8_m1, v8_m2, v8_m3, v8_m4, v8_m5, v8_m6, v8_m7;
int16x4x2_t v16_trn0;
int32x2x2_t v16_trn1;
int32x4_t v16_s0, v16_s1, v16_s2, v16_s3, v16_s4, v16_s5, v16_s6, v16_s7, v16_e, v16_o;
int16x8_t v16_row;
v8_m0 = rgv8_tmp[ i_j ][ 0 ];
v8_m1 = rgv8_tmp[ i_j ][ 1 ];
v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v8_m0 ), vreinterpret_s32_s16( v8_m1 ) );
v16_trn0 = vzip_s16( vreinterpret_s16_s32( v16_trn1.val[ 0 ] ),vreinterpret_s16_s32( v16_trn1.val[ 1 ] ) );
v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v16_trn0.val[ 0 ] ), vreinterpret_s32_s16( v16_trn0.val[ 0 ] ) );
v8_l02 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] );
v8_l13 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] );
v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v16_trn0.val[ 1 ] ), vreinterpret_s32_s16( v16_trn0.val[ 1 ] ) );
v8_l46 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] );
v8_l57 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] );
v16_s0 = vmull_s16( v8_l02, v8_mt0 );
v16_s0 = vmlal_s16( v16_s0, v8_l46, v8_mt2 );
v16_s1 = vmull_s16( v8_l02, v8_mt1 );
v16_s1 = vmlal_s16( v16_s1, v8_l46, v8_mt3 );
v16_s2 = vmull_s16( v8_l13, v8_mt4 );
v16_s2 = vmlal_s16( v16_s2, v8_l57, v8_mt6 );
v16_s3 = vmull_s16( v8_l13, v8_mt5 );
v16_s3 = vmlal_s16( v16_s3, v8_l57, v8_mt7 );
v16_s0 = vpaddq_s32( v16_s0, v16_s1 );
v16_s1 = vpaddq_s32( v16_s2, v16_s3 );
v16_e = vaddq_s32( v16_s0, v16_s1 );
v16_o = vsubq_s32( v16_s0, v16_s1 );
v16_o = vcombine_s32( vrev64_s32( vget_high_s32( v16_o ) ), vrev64_s32( vget_low_s32( v16_o ) ) );
v16_row = vcombine_s16( vmovn_s32( vrshrq_n_s32( v16_e, RND2BITS ) ), vmovn_s32(vrshrq_n_s32( v16_o, RND2BITS ) ) );
vst1q_s16( pi16_dst + ( 8 * i_j ), v16_row );
}
}
int32_t y262_quant8x8_intra_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat, uint16_t *pui16_bias )
{
int32_t i_y, i_x, i_qm, i_nz, i_intra_dc;
int16x8_t v16_zero, v16_nz, v16_2047, v16_m2048;
v16_nz = v16_zero = vmovq_n_s16( 0 );
v16_2047 = vmovq_n_s16( 2047 );
v16_m2048 = vmovq_n_s16( -2048 );
i_intra_dc = pi_coeffs[ 0 ];
pi_coeffs[ 0 ] = 0;
i_nz = 0;
for( i_y = 0; i_y < 8; i_y++ )
{
int16x8_t v16_co;
int16x8_t v16_mask;
uint16x8_t v16_qm, v16_bias, v16_cou;
uint32x4_t v16_res0, v16_res1;
v16_co = vld1q_s16( pi_coeffs + ( i_y * 8 ) );
v16_qm = vld1q_u16( pui16_qmat + ( i_y * 8 ) );
v16_bias = vld1q_u16( pui16_bias + ( i_y * 8 ) );
v16_mask = vreinterpretq_s16_u16( vcgtq_s16( v16_zero, v16_co ) );
v16_co = veorq_s16( v16_co, v16_mask );
v16_co = vsubq_s16( v16_co, v16_mask );
v16_cou = vaddq_u16( vreinterpretq_u16_s16( v16_co ), v16_bias );
v16_res0 = vmull_u16( vget_low_u16( v16_cou ), vget_low_u16( v16_qm ) );
v16_res1 = vmull_u16( vget_high_u16( v16_cou ), vget_high_u16( v16_qm ) );
v16_co = vreinterpretq_s16_u16( vcombine_u16( vshrn_n_u32( v16_res0, 16 ), vshrn_n_u32( v16_res1, 16 ) ) );
v16_co = veorq_s16( v16_co, v16_mask );
v16_co = vsubq_s16( v16_co, v16_mask );
v16_co = vminq_s16( v16_co, v16_2047 );
v16_co = vmaxq_s16( v16_co, v16_m2048 );
vst1q_s16( pi_coeffs + ( i_y * 8 ), v16_co );
v16_nz = vorrq_s16( v16_co, v16_nz );
}
v16_nz = vnegq_s16( vreinterpretq_s16_u16( vceqzq_s16( v16_nz ) ) );
i_nz = vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 0 );
i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 1 );
i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 2 );
i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 3 );
pi_coeffs[ 0 ] = i_intra_dc;
return i_nz;
}
int32_t y262_quant8x8_inter_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat )
{
int32_t i_y, i_x, i_qm, i_nz;
int16x8_t v16_zero, v16_nz, v16_2047, v16_m2048;
v16_nz = v16_zero = vmovq_n_s16( 0 );
v16_2047 = vmovq_n_s16( 2047 );
v16_m2048 = vmovq_n_s16( -2048 );
i_nz = 0;
for( i_y = 0; i_y < 8; i_y++ )
{
int16x8_t v16_co;
int16x8_t v16_mask;
uint16x8_t v16_qm;
uint32x4_t v16_res0, v16_res1;
v16_co = vld1q_s16( pi_coeffs + ( i_y * 8 ) );
v16_qm = vld1q_u16( pui16_qmat + ( i_y * 8 ) );
v16_mask = vreinterpretq_s16_u16( vcgtq_s16( v16_zero, v16_co ) );
v16_co = veorq_s16( v16_co, v16_mask );
v16_co = vsubq_s16( v16_co, v16_mask );
v16_res0 = vmull_u16( vreinterpret_u16_s16( vget_low_s16( v16_co ) ), vget_low_u16( v16_qm ) );
v16_res1 = vmull_u16( vreinterpret_u16_s16( vget_high_s16( v16_co ) ), vget_high_u16( v16_qm ) );
v16_co = vreinterpretq_s16_u16( vcombine_u16( vshrn_n_u32( v16_res0, 16 ), vshrn_n_u32( v16_res1, 16 ) ) );
v16_co = veorq_s16( v16_co, v16_mask );
v16_co = vsubq_s16( v16_co, v16_mask );
v16_co = vminq_s16( v16_co, v16_2047 );
v16_co = vmaxq_s16( v16_co, v16_m2048 );
vst1q_s16( pi_coeffs + ( i_y * 8 ), v16_co );
v16_nz = vorrq_s16( v16_co, v16_nz );
}
v16_nz = vnegq_s16( vreinterpretq_s16_u16( vceqzq_s16( v16_nz ) ) );
i_nz = vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 0 );
i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 1 );
i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 2 );
i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 3 );
return i_nz;
}
//#endif
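Both quantizers fold the sign out with the vcgtq/veorq/vsubq mask trick, scale the magnitude by the 16.16 fixed-point factor from pui16_qmat (the intra path adds pui16_bias first), restore the sign, clamp to the MPEG-2 level range [ -2048, 2047 ], and OR the results together so the return value is nonzero when any quantized coefficient survives (AC only in the intra case, since the DC value is stashed and restored untouched). Per coefficient the intra path amounts to this scalar model (a sketch, not the encoder's C fallback):

#include <stdint.h>

static int16_t quant_intra_coeff_ref( int16_t i16_coeff, uint16_t ui16_qmat, uint16_t ui16_bias )
{
    int32_t i_sign, i_abs, i_level;
    uint16_t ui16_mag;

    i_sign = i16_coeff < 0 ? -1 : 0;                                     /* vcgtq_s16( v16_zero, v16_co ) */
    i_abs = ( i16_coeff ^ i_sign ) - i_sign;                             /* |coeff| via veorq/vsubq */
    ui16_mag = ( uint16_t )( i_abs + ui16_bias );                        /* vaddq_u16, wraps at 16 bits like the NEON code */
    i_level = ( int32_t )( ( ( uint32_t )ui16_mag * ui16_qmat ) >> 16 ); /* vmull_u16 + vshrn_n_u32( ..., 16 ) */
    i_level = ( i_level ^ i_sign ) - i_sign;                             /* restore the sign */
    if( i_level > 2047 ) i_level = 2047;                                 /* vminq_s16 */
    if( i_level < -2048 ) i_level = -2048;                               /* vmaxq_s16 */
    return ( int16_t )i_level;                                           /* inter path: the same without the bias */
}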

src/y262/transform_arm64.h (new file, 36 lines)

@@ -0,0 +1,36 @@
/*
Copyright (c) 2013, Ralf Willenbacher
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
void y262_fdct_neon( int16_t *pi16_block, int16_t *pi16_dst );
void y262_idct_neon( int16_t *pi16_block, int16_t *pi16_dst );
int32_t y262_quant8x8_intra_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat, uint16_t *pui16_bias );
int32_t y262_quant8x8_inter_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat );


@@ -59,11 +59,19 @@ POSSIBILITY OF SUCH DAMAGE.
#include "transform.h"
#include "pixelop.h"
#include "me.h"
#include "transform_x86.h"
#include "pixelop_x86.h"
#include "ratectrl.h"
#include "threads.h"
#ifdef ASSEMBLY_X86
#include "transform_x86.h"
#include "pixelop_x86.h"
#endif
#ifdef ASSEMBLY_ARM64
#include "transform_arm64.h"
#include "pixelop_arm64.h"
#endif
void y262_init_motion_compensation( y262_t *ps_y262 );
void y262_error( y262_t *ps_y262, int32_t i_error_code, int8_t* pi8_format, ... );
void y262_encode_picture( y262_t *ps_y262, y262_picture_t *ps_picture, int32_t i_picture_type, int32_t i_pon );


@@ -548,6 +548,7 @@ int32_t y262_initialize( void *p_y262, y262_configuration_t *ps_config )
ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_c;
ps_y262->s_funcs.f_idct_8x8 = y262_idct_c;
#ifdef ASSEMBLY_X86
if( 1 )
{
ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x16 ] = y262_sad_16x16_sse2;
@@ -565,7 +566,29 @@ int32_t y262_initialize( void *p_y262, y262_configuration_t *ps_config )
ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_sse2;
ps_y262->s_funcs.f_idct_8x8 = y262_idct_sse2;
}
#endif
#ifdef ASSEMBLY_ARM64
if( 1 )
{
ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x16 ] = y262_sad_16x16_neon;
ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x8 ] = y262_sad_16x8_neon;
ps_y262->s_funcs.rgf_satd[ BLOCK_TYPE_16x16 ] = y262_satd_16x16_neon;
ps_y262->s_funcs.rgf_satd[ BLOCK_TYPE_16x8 ] = y262_satd_16x8_neon;
ps_y262->s_funcs.f_ssd_16x16 = y262_ssd_16x16_neon;
ps_y262->s_funcs.f_ssd_8x8 = y262_ssd_8x8_neon;
ps_y262->s_funcs.f_add_8x8 = y262_add_8x8_neon;
ps_y262->s_funcs.f_sub_8x8 = y262_sub_8x8_neon;
ps_y262->s_funcs.f_quant8x8_intra_fw = y262_quant8x8_intra_fw_mpeg2_neon;
ps_y262->s_funcs.f_quant8x8_inter_fw = y262_quant8x8_inter_fw_mpeg2_neon;
ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_neon;
ps_y262->s_funcs.f_idct_8x8 = y262_idct_neon;
}
#endif
memset( ps_y262->rgi_y262_motion_bits_x, 0, sizeof( ps_y262->rgi_y262_motion_bits_x ) );
memset( ps_y262->rgi_y262_motion_bits_y, 1, sizeof( ps_y262->rgi_y262_motion_bits_y ) );


@@ -1,12 +1,6 @@
cmake_minimum_required(VERSION 3.1)
project(y262app)
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(ARCH "_x64")
else()
set(ARCH "_x86")
endif()
set( SRC_FILES
${CMAKE_CURRENT_SOURCE_DIR}/main.c
)
@@ -14,7 +8,7 @@ set( SRC_FILES
add_executable(y262app ${SRC_FILES})
target_link_libraries(y262app liby262)
set_target_properties(y262app PROPERTIES
OUTPUT_NAME "y262$<$<CONFIG:Debug>:d>${ARCH}"
OUTPUT_NAME "y262$<$<CONFIG:Debug>:d>"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
POSITION_INDEPENDENT_CODE ON
)