diff --git a/src/y262/CMakeLists.txt b/src/y262/CMakeLists.txt index 7d697f8..4d2b259 100644 --- a/src/y262/CMakeLists.txt +++ b/src/y262/CMakeLists.txt @@ -4,32 +4,45 @@ project(liby262) find_package(Threads) find_program(YASM_EXE NAMES yasm) -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH "_x64") - if(WIN32) - set(YASM_ARGS -f win32 -m amd64 -DARCH_X86_64 -DPIC) - elseif(APPLE) - set(YASM_ARGS -f macho64 -m amd64 -DARCH_X86_64 -DPIC --prefix=_) - else() - set(YASM_ARGS -f elf64 -m amd64 -DARCH_X86_64 -DPIC) - endif() -else() - set(ARCH "_x86") - if(WIN32) - set(YASM_ARGS -f win32 --prefix=_) - elseif(APPLE) - set(YASM_ARGS -f macho32 --prefix=_) - else() - set(YASM_ARGS -f elf32) - endif() +message( "architecture: ${CMAKE_SYSTEM_PROCESSOR}") + +set(Y262_TARGET_ARCH "unknown") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(Y262_TARGET_ARCH "intelx86") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|x86.*|i386.*") + set(Y262_TARGET_ARCH "intelx86") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|ARM64.*|AARCH64.*)") + set(Y262_TARGET_ARCH "arm64") endif() -add_custom_command(OUTPUT pixelop_x86.o COMMAND ${YASM_EXE} - ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_x86.asm) -add_custom_command(OUTPUT transform_x86.o COMMAND ${YASM_EXE} - ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/transform_x86.asm) +message( "target_arch: ${Y262_TARGET_ARCH}") -add_library(liby262 STATIC +if(Y262_TARGET_ARCH MATCHES "intelx86") + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + if(WIN32) + set(YASM_ARGS -f win32 -m amd64 -DARCH_X86_64 -DPIC) + elseif(APPLE) + set(YASM_ARGS -f macho64 -m amd64 -DARCH_X86_64 -DPIC --prefix=_) + else() + set(YASM_ARGS -f elf64 -m amd64 -DARCH_X86_64 -DPIC) + endif() + else() + if(WIN32) + set(YASM_ARGS -f win32 --prefix=_) + elseif(APPLE) + set(YASM_ARGS -f macho32 --prefix=_) + else() + set(YASM_ARGS -f elf32) + endif() + endif() + + add_custom_command(OUTPUT pixelop_x86.o COMMAND ${YASM_EXE} + ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_x86.asm) + add_custom_command(OUTPUT transform_x86.o COMMAND ${YASM_EXE} + ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/transform_x86.asm) +endif() + +set(liby262_sources_basic ${CMAKE_CURRENT_SOURCE_DIR}/aboveslicelevel.h ${CMAKE_CURRENT_SOURCE_DIR}/bitstream.h ${CMAKE_CURRENT_SOURCE_DIR}/lookahead.h @@ -56,17 +69,36 @@ add_library(liby262 STATIC ${CMAKE_CURRENT_SOURCE_DIR}/transform.c ${CMAKE_CURRENT_SOURCE_DIR}/y262.c ${CMAKE_CURRENT_SOURCE_DIR}/y262api.c - - ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o - ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o ) -set_target_properties(liby262 PROPERTIES - OUTPUT_NAME "liby262$<$:d>${ARCH}" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +set(liby262_sources_assembly "") + +if(Y262_TARGET_ARCH MATCHES "intelx86") + set(liby262_sources_assembly + ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o + ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o + ) +elseif(Y262_TARGET_ARCH MATCHES "arm64") + set(liby262_sources_assembly + ${CMAKE_CURRENT_SOURCE_DIR}/transform_arm64.c + ${CMAKE_CURRENT_SOURCE_DIR}/transform_arm64.h + ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_arm64.c + ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_arm64.h + ) +endif() + +add_library(liby262 STATIC + ${liby262_sources_basic} + ${liby262_sources_assembly} ) - + +target_include_directories(liby262 
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +if(Y262_TARGET_ARCH MATCHES "intelx86") + add_compile_definitions(ASSEMBLY_X86) +elseif(Y262_TARGET_ARCH MATCHES "arm64") + add_compile_definitions(ASSEMBLY_ARM64) +endif() if(WIN32) target_compile_definitions(liby262 PRIVATE WIN32) @@ -77,8 +109,12 @@ else() target_link_libraries(liby262 PUBLIC m) endif() -set_target_properties(liby262 PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(liby262 PROPERTIES + OUTPUT_NAME "liby262$<$:d>${MY_ARCH}" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + POSITION_INDEPENDENT_CODE ON +) -target_include_directories(liby262 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(liby262 PUBLIC Threads::Threads) diff --git a/src/y262/mc.c b/src/y262/mc.c index 7719a02..515c594 100644 --- a/src/y262/mc.c +++ b/src/y262/mc.c @@ -30,6 +30,8 @@ POSSIBILITY OF SUCH DAMAGE. #include "y262.h" +#ifdef ASSEMBLY_X86 + void y262_motcomp_16x16_00_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); void y262_motcomp_16x16_01_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); void y262_motcomp_16x16_10_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); @@ -80,6 +82,65 @@ void y262_motcomp_8x4_01_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, ui void y262_motcomp_8x4_10_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); void y262_motcomp_8x4_11_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +#endif + + +#ifdef ASSEMBLY_ARM64 + +void y262_motcomp_16x16_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x16_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x16_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x16_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +void y262_motcomp_16x8_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x8_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x8_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x8_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +void y262_motcomp_8x16_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x16_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x16_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x16_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +void y262_motcomp_8x8_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x8_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x8_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void 
y262_motcomp_8x8_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +void y262_motcomp_8x4_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x4_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x4_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x4_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +void y262_motcomp_16x16_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x16_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x16_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x16_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +void y262_motcomp_16x8_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x8_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x8_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_16x8_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +void y262_motcomp_8x16_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x16_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x16_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x16_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +void y262_motcomp_8x8_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x8_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x8_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x8_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +void y262_motcomp_8x4_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x4_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x4_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); +void y262_motcomp_8x4_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ); + +#endif + + + @@ -322,7 +383,7 @@ void y262_init_motion_compensation( y262_t *ps_y262 ) ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_avg; ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg; -#if 1 +#ifdef ASSEMBLY_X86 if( 1 ) { @@ -382,6 +443,70 @@ void y262_init_motion_compensation( y262_t *ps_y262 ) ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg_mmxext; } #endif + +#ifdef ASSEMBLY_ARM64 + + if( 1 ) + { 
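+		/* The assignments below install the NEON versions over the C defaults set above,
+		   mirroring the ASSEMBLY_X86 branch; indices are [ block size ][ half-pel case ].
+		   Per the MC_FUNC_NEON instantiations in pixelop_arm64.c, _00 is an integer-pel
+		   copy, _01 averages horizontally ( src, src + 1 ), _10 vertically ( src,
+		   src + i_src_stride ), and _11 blends all four neighbours with a carry-corrected
+		   rounding average. */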
+ /* copy */ + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_00 ] = y262_motcomp_16x16_00_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ] = y262_motcomp_16x16_01_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_10 ] = y262_motcomp_16x16_10_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_11 ] = y262_motcomp_16x16_11_put_neon; + + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_00 ] = y262_motcomp_16x8_00_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_01 ] = y262_motcomp_16x8_01_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_10 ] = y262_motcomp_16x8_10_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_11 ] = y262_motcomp_16x8_11_put_neon; + + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_00 ] = y262_motcomp_8x16_00_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_01 ] = y262_motcomp_8x16_01_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_10 ] = y262_motcomp_8x16_10_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_11 ] = y262_motcomp_8x16_11_put_neon; + + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_00 ] = y262_motcomp_8x8_00_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_01 ] = y262_motcomp_8x8_01_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_10 ] = y262_motcomp_8x8_10_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_11 ] = y262_motcomp_8x8_11_put_neon; + + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_00 ] = y262_motcomp_8x4_00_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_01 ] = y262_motcomp_8x4_01_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_put_neon; + ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_put_neon; + + + + /* avg */ + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_00 ] = y262_motcomp_16x16_00_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ] = y262_motcomp_16x16_01_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_10 ] = y262_motcomp_16x16_10_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_11 ] = y262_motcomp_16x16_11_avg_neon; + + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_00 ] = y262_motcomp_16x8_00_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_01 ] = y262_motcomp_16x8_01_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_10 ] = y262_motcomp_16x8_10_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_11 ] = y262_motcomp_16x8_11_avg_neon; + + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_00 ] = y262_motcomp_8x16_00_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_01 ] = y262_motcomp_8x16_01_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_10 ] = y262_motcomp_8x16_10_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_11 ] = y262_motcomp_8x16_11_avg_neon; + + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_00 ] = y262_motcomp_8x8_00_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_01 ] = y262_motcomp_8x8_01_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_10 ] = 
y262_motcomp_8x8_10_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_11 ] = y262_motcomp_8x8_11_avg_neon; + + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_00 ] = y262_motcomp_8x4_00_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_01 ] = y262_motcomp_8x4_01_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_avg_neon; + ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg_neon; + + } +#endif + + + } diff --git a/src/y262/pixelop.c b/src/y262/pixelop.c index 83b547c..cbb32bd 100644 --- a/src/y262/pixelop.c +++ b/src/y262/pixelop.c @@ -226,6 +226,8 @@ int32_t y262_satd_16x8( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk return i_satd; } +#ifdef ASSEMBLY_X86 + int32_t y262_satd_16x16_sse2( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ) { int32_t i_satd; @@ -249,6 +251,7 @@ int32_t y262_satd_16x8_sse2( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui return i_satd; } +#endif int32_t y262_ssd_16x16( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride ) { diff --git a/src/y262/pixelop_arm64.c b/src/y262/pixelop_arm64.c new file mode 100644 index 0000000..456993a --- /dev/null +++ b/src/y262/pixelop_arm64.c @@ -0,0 +1,773 @@ +/* +Copyright (c) 2013, Ralf Willenbacher +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include + +#include "y262.h" + +int32_t y262_sad_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ) +{ + int32_t i_sad, i_y; + int64_t i64_sad; + uint8x16_t v16_blk1, v16_blk2; + uint8x8_t v8_a, v8_b; + uint16x8_t v16_res0, v16_res1; + uint32x4_t v16_hadd0; + uint32x2_t v8_hadd1; + uint64x1_t v8_hadd2; + + v16_blk1 = vld1q_u8 ( pui8_blk1 ); + pui8_blk1 += i_stride1; + v16_blk2 = vld1q_u8 ( pui8_blk2 ); + pui8_blk2 += i_stride2; + v8_a = vget_low_u8( v16_blk1 ); + v8_b = vget_low_u8( v16_blk2 ); + v16_res0 = vabdl_u8 ( v8_a, v8_b ); + v8_a = vget_high_u8( v16_blk1 ); + v8_b = vget_high_u8( v16_blk2 ); + v16_res1 = vabdl_u8 ( v8_a, v8_b ); + + for( i_y = 1; i_y < 8; i_y++ ) + { + v16_blk1 = vld1q_u8 ( pui8_blk1 ); + pui8_blk1 += i_stride1; + v16_blk2 = vld1q_u8 ( pui8_blk2 ); + pui8_blk2 += i_stride2; + v8_a = vget_low_u8( v16_blk1 ); + v8_b = vget_low_u8( v16_blk2 ); + v16_res0 = vabal_u8 ( v16_res0, v8_a, v8_b ); + v8_a = vget_high_u8( v16_blk1 ); + v8_b = vget_high_u8( v16_blk2 ); + v16_res1 = vabal_u8 ( v16_res1, v8_a, v8_b ); + } + + v16_res0 = vaddq_u16( v16_res0, v16_res1 ); + v16_hadd0 = vpaddlq_u16( v16_res0 ); + v8_hadd1 = vadd_u32( vget_low_u32( v16_hadd0 ), vget_high_u32( v16_hadd0 ) ); + v8_hadd2 = vpaddl_u32( v8_hadd1 ); + + i64_sad = vget_lane_u64( v8_hadd2, 0 ); + i_sad = ( int32_t )i64_sad; + + return i_sad; +} + + +int32_t y262_sad_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ) +{ + int32_t i_sad, i_y; + int64_t i64_sad; + uint8x16_t v16_blk1, v16_blk2; + uint8x8_t v8_a, v8_b; + uint16x8_t v16_res0, v16_res1; + uint32x4_t v16_hadd0; + uint32x2_t v8_hadd1; + uint64x1_t v8_hadd2; + + v16_blk1 = vld1q_u8 ( pui8_blk1 ); + pui8_blk1 += i_stride1; + v16_blk2 = vld1q_u8 ( pui8_blk2 ); + pui8_blk2 += i_stride2; + v8_a = vget_low_u8( v16_blk1 ); + v8_b = vget_low_u8( v16_blk2 ); + v16_res0 = vabdl_u8 ( v8_a, v8_b ); + v8_a = vget_high_u8( v16_blk1 ); + v8_b = vget_high_u8( v16_blk2 ); + v16_res1 = vabdl_u8 ( v8_a, v8_b ); + + for( i_y = 1; i_y < 16; i_y++ ) + { + v16_blk1 = vld1q_u8 ( pui8_blk1 ); + pui8_blk1 += i_stride1; + v16_blk2 = vld1q_u8 ( pui8_blk2 ); + pui8_blk2 += i_stride2; + v8_a = vget_low_u8( v16_blk1 ); + v8_b = vget_low_u8( v16_blk2 ); + v16_res0 = vabal_u8 ( v16_res0, v8_a, v8_b ); + v8_a = vget_high_u8( v16_blk1 ); + v8_b = vget_high_u8( v16_blk2 ); + v16_res1 = vabal_u8 ( v16_res1, v8_a, v8_b ); + } + + v16_res0 = vaddq_u16( v16_res0, v16_res1 ); + v16_hadd0 = vpaddlq_u16( v16_res0 ); + v8_hadd1 = vadd_u32( vget_low_u32( v16_hadd0 ), vget_high_u32( v16_hadd0 ) ); + v8_hadd2 = vpaddl_u32( v8_hadd1 ); + + i64_sad = vget_lane_u64( v8_hadd2, 0 ); + i_sad = ( int32_t )i64_sad; + + return i_sad; +} + + +#define HADAMARD_NEON_4x2( d0, d1, d2, d3 ) \ + { \ + int32x4x2_t v32_a0, v32_b0; \ + int16x8x2_t v32_a1, v32_b1; \ + \ + d0 = vaddq_s16( d0, d1 ); \ + d2 = vaddq_s16( d2, d3 ); \ + \ + d1 = vaddq_s16( d1, d1 ); \ + d3 = vaddq_s16( d3, d3 ); \ + d1 = vsubq_s16( d1, d0 ); \ + d3 = vsubq_s16( d3, d2 ); \ + \ + d0 = vaddq_s16( d0, d2 ); \ + d1 = vaddq_s16( d1, d3 ); \ + \ + d2 = vaddq_s16( d2, d2 ); \ + d3 = vaddq_s16( d3, d3 ); \ + d2 = vsubq_s16( d2, d0 ); \ + d3 = vsubq_s16( d3, d1 ); \ + \ + v32_a0 = vtrnq_s32( vreinterpretq_s32_s16( d0 ), vreinterpretq_s32_s16( d2 ) ); \ + v32_b0 = vtrnq_s32( vreinterpretq_s32_s16( d1 ), vreinterpretq_s32_s16( d3 ) ); \ + v32_a1 = vtrnq_s16( vreinterpretq_s16_s32( v32_a0.val[ 0 ] ), vreinterpretq_s16_s32( v32_b0.val[ 0 ] ) ); \ + v32_b1 = 
vtrnq_s16( vreinterpretq_s16_s32( v32_a0.val[ 1 ] ), vreinterpretq_s16_s32( v32_b0.val[ 1 ] ) ); \ + d0 = vcombine_s16( vget_low_s16( v32_a1.val[ 0 ] ), vget_high_s16( v32_a1.val[ 0 ] ) ); \ + d1 = vcombine_s16( vget_low_s16( v32_a1.val[ 1 ] ), vget_high_s16( v32_a1.val[ 1 ] ) ); \ + d2 = vcombine_s16( vget_low_s16( v32_b1.val[ 1 ] ), vget_high_s16( v32_b1.val[ 1 ] ) ); \ + d3 = vcombine_s16( vget_low_s16( v32_b1.val[ 0 ] ), vget_high_s16( v32_b1.val[ 0 ] ) ); \ + \ + d0 = vaddq_s16( d0, d1 ); \ + d2 = vaddq_s16( d2, d3 ); \ + \ + d1 = vaddq_s16( d1, d1 ); \ + d3 = vaddq_s16( d3, d3 ); \ + d1 = vsubq_s16( d1, d0 ); \ + d3 = vsubq_s16( d3, d2 ); \ + \ + d0 = vaddq_s16( d0, d2 ); \ + d1 = vaddq_s16( d1, d3 ); \ + \ + d2 = vaddq_s16( d2, d2 ); \ + d3 = vaddq_s16( d3, d3 ); \ + d2 = vsubq_s16( d2, d0 ); \ + d3 = vsubq_s16( d3, d1 ); \ + } + +#define ADDSUB( d0, d1 ) \ + { \ + int16x8_t v16_a, v16_s; \ + v16_a = vaddq_s16( d0, d1 ); \ + v16_s = vsubq_s16( d1, d0 ); \ + d0 = v16_a; \ + d1 = v16_s; \ + } + + +#define ABSADDL( sum, vector0, vector1 ) \ + { \ + vector0 = vabsq_s16( vector0 ); \ + vector1 = vabsq_s16( vector1 ); \ + sum = vaddq_s32( sum, vaddl_s16( vget_low_s16( vector0 ), vget_high_s16( vector0 ) ) ); \ + sum = vaddq_s32( sum, vaddl_s16( vget_low_s16( vector1 ), vget_high_s16( vector1 ) ) ); \ + } + +int32_t y262_satd_8x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ) +{ + int32_t i_satd; + int64_t i64_satd; + int16x8_t v16_d0, v16_d1, v16_d2, v16_d3, v16_d4, v16_d5, v16_d6, v16_d7; + int16x8_t v16_a0, v16_a1, v16_a2, v16_a3; + int32x4_t v16_res; + int32x2_t v8_hadd0; + int64x1_t v8_hadd1; + + v16_res = vmovq_n_s32( 0 ); + + v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 0 ), vld1_u8( pui8_blk2 + i_stride2 * 0 ) ) ); + v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 1 ), vld1_u8( pui8_blk2 + i_stride2 * 1 ) ) ); + v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 2 ), vld1_u8( pui8_blk2 + i_stride2 * 2 ) ) ); + v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 3 ), vld1_u8( pui8_blk2 + i_stride2 * 3 ) ) ); + v16_d4 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 4 ), vld1_u8( pui8_blk2 + i_stride2 * 4 ) ) ); + v16_d5 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 5 ), vld1_u8( pui8_blk2 + i_stride2 * 5 ) ) ); + v16_d6 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 6 ), vld1_u8( pui8_blk2 + i_stride2 * 6 ) ) ); + v16_d7 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 7 ), vld1_u8( pui8_blk2 + i_stride2 * 7 ) ) ); + + HADAMARD_NEON_4x2( v16_d0, v16_d1, v16_d2, v16_d3 ); + HADAMARD_NEON_4x2( v16_d4, v16_d5, v16_d6, v16_d7 ); + + ADDSUB( v16_d0, v16_d4 ); + ADDSUB( v16_d1, v16_d5 ); + ADDSUB( v16_d2, v16_d6 ); + ADDSUB( v16_d3, v16_d7 ); + + v16_a0 = vcombine_s16( vget_low_s16( v16_d0 ), vget_low_s16( v16_d4 ) ); + v16_a1 = vcombine_s16( vget_high_s16( v16_d0 ), vget_high_s16( v16_d4 ) ); + ADDSUB( v16_a0, v16_a1 ); + ABSADDL( v16_res, v16_a0, v16_a1 ); + + v16_a0 = vcombine_s16( vget_low_s16( v16_d1 ), vget_low_s16( v16_d5 ) ); + v16_a1 = vcombine_s16( vget_high_s16( v16_d1 ), vget_high_s16( v16_d5 ) ); + ADDSUB( v16_a0, v16_a1 ); + ABSADDL( v16_res, v16_a0, v16_a1 ); + + v16_a0 = vcombine_s16( vget_low_s16( v16_d2 ), vget_low_s16( v16_d6 ) ); + v16_a1 = vcombine_s16( vget_high_s16( v16_d2 ), vget_high_s16( v16_d6 ) ); + ADDSUB( v16_a0, v16_a1 ); + ABSADDL( 
v16_res, v16_a0, v16_a1 ); + + v16_a0 = vcombine_s16( vget_low_s16( v16_d3 ), vget_low_s16( v16_d7 ) ); + v16_a1 = vcombine_s16( vget_high_s16( v16_d3 ), vget_high_s16( v16_d7 ) ); + ADDSUB( v16_a0, v16_a1 ); + ABSADDL( v16_res, v16_a0, v16_a1 ); + + v8_hadd0 = vadd_s32( vget_low_s32( v16_res ), vget_high_s32( v16_res ) ); + v8_hadd1 = vpaddl_s32( v8_hadd0 ); + + i64_satd = vget_lane_s64( v8_hadd1, 0 ); + i_satd = ( int32_t )i64_satd; + + i_satd = ( i_satd + 2 ) >> 2; + + return i_satd; +} + + +int32_t y262_satd_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ) +{ + int32_t i_satd; + + i_satd = y262_satd_8x8_neon( pui8_blk1, i_stride1, pui8_blk2, i_stride2 ); + i_satd += y262_satd_8x8_neon( pui8_blk1 + 8, i_stride1, pui8_blk2 + 8, i_stride2 ); + i_satd += y262_satd_8x8_neon( pui8_blk1 + ( 8 * i_stride1 ), i_stride1, pui8_blk2 + ( 8 * i_stride2 ), i_stride2 ); + i_satd += y262_satd_8x8_neon( pui8_blk1 + 8 + ( 8 * i_stride1 ), i_stride1, pui8_blk2 + 8 + ( 8 * i_stride2 ), i_stride2 ); + + return i_satd; +} + + +int32_t y262_satd_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ) +{ + int32_t i_satd; + + i_satd = y262_satd_8x8_neon( pui8_blk1, i_stride1, pui8_blk2, i_stride2 ); + i_satd += y262_satd_8x8_neon( pui8_blk1 + 8, i_stride1, pui8_blk2 + 8, i_stride2 ); + + return i_satd; +} + + + +int32_t y262_ssd_8x8_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride ) +{ + int32_t i_ssd; + int16x8_t v16_d0, v16_d1, v16_d2, v16_d3; + int32x4_t v16_ssd0, v16_ssd1; + int32x2_t v8_hadd0; + int64x1_t v8_hadd1; + + v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 0 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 0 * i_blk2_stride ) ) ) ); + v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 1 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 1 * i_blk2_stride ) ) ) ); + v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 2 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 2 * i_blk2_stride ) ) ) ); + v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 3 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 3 * i_blk2_stride ) ) ) ); + + v16_ssd0 = vmull_s16( vget_low_s16( v16_d0 ), vget_low_s16( v16_d0 ) ); + v16_ssd1 = vmull_s16( vget_high_s16( v16_d0 ), vget_high_s16( v16_d0 ) ); + + v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d1 ), vget_low_s16( v16_d1 ) ); + v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d1 ), vget_high_s16( v16_d1 ) ); + + v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d2 ), vget_low_s16( v16_d2 ) ); + v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d2 ), vget_high_s16( v16_d2 ) ); + + v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d3 ), vget_low_s16( v16_d3 ) ); + v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d3 ), vget_high_s16( v16_d3 ) ); + + v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 4 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 4 * i_blk2_stride ) ) ) ); + v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 5 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 5 * i_blk2_stride ) ) ) ); + v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 6 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 6 * i_blk2_stride ) ) ) ); + v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 7 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 7 * i_blk2_stride ) ) ) ); + + v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d0 ), vget_low_s16( v16_d0 ) ); + v16_ssd1 = vmlal_s16( v16_ssd1, 
vget_high_s16( v16_d0 ), vget_high_s16( v16_d0 ) ); + + v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d1 ), vget_low_s16( v16_d1 ) ); + v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d1 ), vget_high_s16( v16_d1 ) ); + + v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d2 ), vget_low_s16( v16_d2 ) ); + v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d2 ), vget_high_s16( v16_d2 ) ); + + v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d3 ), vget_low_s16( v16_d3 ) ); + v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d3 ), vget_high_s16( v16_d3 ) ); + + v16_ssd0 = vaddq_s32( v16_ssd0, v16_ssd1 ); + + v8_hadd0 = vadd_s32( vget_low_s32( v16_ssd0 ), vget_high_s32( v16_ssd0 ) ); + v8_hadd0 = vpadd_s32( v8_hadd0, v8_hadd0 ); + + i_ssd = vget_lane_s32( v8_hadd0, 0 ); + + return i_ssd; +} + +int32_t y262_ssd_16x16_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride ) +{ + int32_t i_ssd; + + i_ssd = y262_ssd_8x8_neon( pui8_blk1, i_blk1_stride, pui8_blk2, i_blk2_stride ); + i_ssd += y262_ssd_8x8_neon( pui8_blk1 + 8, i_blk1_stride, pui8_blk2 + 8, i_blk2_stride ); + i_ssd += y262_ssd_8x8_neon( pui8_blk1 + ( 8 * i_blk1_stride ), i_blk1_stride, pui8_blk2 + ( 8 * i_blk2_stride ), i_blk2_stride ); + i_ssd += y262_ssd_8x8_neon( pui8_blk1 + 8 + ( 8 * i_blk1_stride ), i_blk1_stride, pui8_blk2 + 8 + ( 8 * i_blk2_stride), i_blk2_stride ); + return i_ssd; +} + + +void y262_sub_8x8_neon( int16_t *pi16_diff, uint8_t *pui8_src1, int32_t i_stride_src1, uint8_t *pui8_src2, int32_t i_stride_src2 ) +{ + int16x8_t v16_d0, v16_d1, v16_d2, v16_d3; + + v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 0 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 0 * i_stride_src2 ) ) ) ); + v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 1 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 1 * i_stride_src2 ) ) ) ); + v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 2 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 2 * i_stride_src2 ) ) ) ); + v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 3 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 3 * i_stride_src2 ) ) ) ); + + vst1q_s16( pi16_diff + 0, v16_d0 ); + vst1q_s16( pi16_diff + 8, v16_d1 ); + vst1q_s16( pi16_diff + 16, v16_d2 ); + vst1q_s16( pi16_diff + 24, v16_d3 ); + + v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 4 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 4 * i_stride_src2 ) ) ) ); + v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 5 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 5 * i_stride_src2 ) ) ) ); + v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 6 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 6 * i_stride_src2 ) ) ) ); + v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 7 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 7 * i_stride_src2 ) ) ) ); + + vst1q_s16( pi16_diff + 32, v16_d0 ); + vst1q_s16( pi16_diff + 40, v16_d1 ); + vst1q_s16( pi16_diff + 48, v16_d2 ); + vst1q_s16( pi16_diff + 56, v16_d3 ); +} + +void y262_add_8x8_neon( uint8_t *pui8_destination, int32_t i_destination_stride, uint8_t *pui8_base, int32_t i_base_stride, int16_t *pi16_difference ) +{ + int32_t i_y; + + int16x8_t v16_zero = vmovq_n_s16( 0 ); + + for( i_y = 0; i_y < 8; i_y += 2 ) + { + int16x8_t v16_d0, v16_b0, v16_d1, v16_b1; + + v16_d0 = vld1q_s16( pi16_difference + ( i_y * 8 ) ); + v16_d1 = vld1q_s16( pi16_difference + ( ( i_y + 1 ) * 8 ) ); + v16_b0 = vreinterpretq_s16_u16( vshll_n_u8( vld1_u8( pui8_base + ( i_y * i_base_stride ) ), 0 ) 
); + v16_b1 = vreinterpretq_s16_u16( vshll_n_u8( vld1_u8( pui8_base + ( ( i_y + 1 ) * i_base_stride ) ), 0 ) ); + + v16_d0 = vaddq_s16( v16_d0, v16_b0 ); + v16_d1 = vaddq_s16( v16_d1, v16_b1 ); + + v16_d0 = vmaxq_s16( v16_zero, v16_d0 ); + v16_d1 = vmaxq_s16( v16_zero, v16_d1 ); + + vst1_u8( pui8_destination + ( i_y * i_destination_stride ), vqmovn_u16( vreinterpretq_u16_s16( v16_d0 ) ) ); + vst1_u8( pui8_destination + ( ( i_y + 1 ) * i_destination_stride ), vqmovn_u16( vreinterpretq_u16_s16( v16_d1 ) ) ); + } +} + + +/* MC */ + +#define MC_FUNC_NEON( name, i_width, i_height, hpelidx ) \ +void y262_motcomp_##name##_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ) \ +{ \ + int32_t i_x, i_y; \ + \ + if( hpelidx == 0 ) \ + { \ + for( i_y = 0; i_y < i_height; i_y++ ) \ + { \ + if( i_width == 8 ) \ + { \ + uint8x8_t v8_a; \ + v8_a = vld1_u8( pui8_src ); \ + vst1_u8( pui8_dst, v8_a ); \ + } \ + else if( i_width == 16 ) \ + { \ + uint8x16_t v16_a; \ + v16_a = vld1q_u8( pui8_src ); \ + vst1q_u8( pui8_dst, v16_a ); \ + } \ + pui8_src += i_src_stride; \ + pui8_dst += i_dst_stride; \ + } \ + } \ + else if( hpelidx == 1 ) \ + { \ + uint8_t *pui8_src1, *pui8_src2; \ + \ + pui8_src1 = pui8_src; \ + pui8_src2 = pui8_src + 1; \ + \ + for( i_y = 0; i_y < i_height; i_y++ ) \ + { \ + if( i_width == 8 ) \ + { \ + uint8x8_t v8_a, v8_b; \ + v8_a = vld1_u8( pui8_src1 ); \ + v8_b = vld1_u8( pui8_src2 ); \ + vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_b ) ); \ + } \ + else if( i_width == 16 ) \ + { \ + uint8x16_t v16_a, v16_b; \ + v16_a = vld1q_u8( pui8_src1 ); \ + v16_b = vld1q_u8( pui8_src2 ); \ + vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_b ) ); \ + } \ + pui8_src1 += i_src_stride; \ + pui8_src2 += i_src_stride; \ + pui8_dst += i_dst_stride; \ + } \ + } \ + else if( hpelidx == 2 ) \ + { \ + uint8_t *pui8_src1, *pui8_src2; \ + \ + pui8_src1 = pui8_src; \ + pui8_src2 = pui8_src + i_src_stride; \ + \ + for( i_y = 0; i_y < i_height; i_y++ ) \ + { \ + if( i_width == 8 ) \ + { \ + uint8x8_t v8_a, v8_b; \ + v8_a = vld1_u8( pui8_src1 ); \ + v8_b = vld1_u8( pui8_src2 ); \ + vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_b ) ); \ + } \ + else if( i_width == 16 ) \ + { \ + uint8x16_t v16_a, v16_b; \ + v16_a = vld1q_u8( pui8_src1 ); \ + v16_b = vld1q_u8( pui8_src2 ); \ + vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_b ) ); \ + } \ + pui8_src1 += i_src_stride; \ + pui8_src2 += i_src_stride; \ + pui8_dst += i_dst_stride; \ + } \ + } \ + else \ + { \ + uint8_t *pui8_src1, *pui8_src2, *pui8_src3, *pui8_src4; \ + uint8x8_t v8_a, v8_b, v8_c, v8_d; \ + uint8x16_t v16_a, v16_b, v16_c, v16_d; \ + uint8x8_t v8_one = vmov_n_u8( 1 ); \ + uint8x16_t v16_one = vmovq_n_u8( 1 ); \ + \ + pui8_src1 = pui8_src; \ + pui8_src2 = pui8_src + 1; \ + pui8_src3 = pui8_src + i_src_stride; \ + pui8_src4 = pui8_src + i_src_stride + 1; \ + \ + if( i_width == 8 ) \ + { \ + v8_a = vld1_u8( pui8_src1 ); \ + v8_b = vld1_u8( pui8_src2 ); \ + } \ + else if( i_width == 16 ) \ + { \ + v16_a = vld1q_u8( pui8_src1 ); \ + v16_b = vld1q_u8( pui8_src2 ); \ + } \ + for( i_y = 0; i_y < i_height; i_y++ ) \ + { \ + if( i_width == 8 ) \ + { \ + uint8x8_t v8_carry0, v8_carry1; \ + v8_c = vld1_u8( pui8_src3 ); \ + v8_d = vld1_u8( pui8_src4 ); \ + \ + v8_carry0 = veor_u8( v8_a, v8_c); \ + v8_carry1 = veor_u8( v8_b, v8_d); \ + \ + v8_a = vrhadd_u8( v8_a, v8_c ); \ + v8_b = vrhadd_u8( v8_b, v8_d ); \ + v8_carry0 = vorr_u8( v8_carry0, v8_carry1 ); \ + \ + v8_carry1 = veor_u8( v8_a, v8_b); \ + v8_carry0 = vand_u8( v8_carry0, v8_carry1 ); \ + 
v8_carry0 = vand_u8( v8_carry0, v8_one ); \ + \ + v8_a = vrhadd_u8( v8_a, v8_b ); \ + v8_a = vsub_u8( v8_a, v8_carry0 ); \ + \ + vst1_u8( pui8_dst, v8_a ); \ + \ + v8_a = v8_c; \ + v8_b = v8_d; \ + } \ + else if( i_width == 16 ) \ + { \ + uint8x16_t v16_carry0, v16_carry1; \ + v16_c = vld1q_u8( pui8_src3 ); \ + v16_d = vld1q_u8( pui8_src4 ); \ + \ + v16_carry0 = veorq_u8( v16_a, v16_c); \ + v16_carry1 = veorq_u8( v16_b, v16_d); \ + \ + v16_a = vrhaddq_u8( v16_a, v16_c ); \ + v16_b = vrhaddq_u8( v16_b, v16_d ); \ + v16_carry0 = vorrq_u8( v16_carry0, v16_carry1 ); \ + \ + v16_carry1 = veorq_u8( v16_a, v16_b); \ + v16_carry0 = vandq_u8( v16_carry0, v16_carry1 ); \ + v16_carry0 = vandq_u8( v16_carry0, v16_one ); \ + \ + v16_a = vrhaddq_u8( v16_a, v16_b ); \ + v16_a = vsubq_u8( v16_a, v16_carry0 ); \ + \ + vst1q_u8( pui8_dst, v16_a ); \ + \ + v16_a = v16_c; \ + v16_b = v16_d; \ + } \ + pui8_src1 += i_src_stride; \ + pui8_src2 += i_src_stride; \ + pui8_src3 += i_src_stride; \ + pui8_src4 += i_src_stride; \ + pui8_dst += i_dst_stride; \ + } \ + } \ +} \ + \ + \ +void y262_motcomp_##name##_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ) \ +{ \ + int32_t i_x, i_y; \ + \ + if( hpelidx == 0 ) \ + { \ + for( i_y = 0; i_y < i_height; i_y++ ) \ + { \ + if( i_width == 8 ) \ + { \ + uint8x8_t v8_a, v8_z; \ + v8_a = vld1_u8( pui8_src ); \ + v8_z = vld1_u8( pui8_dst ); \ + vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \ + } \ + else if( i_width == 16 ) \ + { \ + uint8x16_t v16_a, v16_z; \ + v16_a = vld1q_u8( pui8_src ); \ + v16_z = vld1q_u8( pui8_dst ); \ + vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \ + } \ + pui8_src += i_src_stride; \ + pui8_dst += i_dst_stride; \ + } \ + } \ + else if( hpelidx == 1 ) \ + { \ + uint8_t *pui8_src1, *pui8_src2; \ + \ + pui8_src1 = pui8_src; \ + pui8_src2 = pui8_src + 1; \ + \ + for( i_y = 0; i_y < i_height; i_y++ ) \ + { \ + if( i_width == 8 ) \ + { \ + uint8x8_t v8_a, v8_b, v8_z; \ + v8_a = vld1_u8( pui8_src1 ); \ + v8_b = vld1_u8( pui8_src2 ); \ + v8_z = vld1_u8( pui8_dst ); \ + v8_a = vrhadd_u8( v8_a, v8_b ); \ + vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \ + } \ + else if( i_width == 16 ) \ + { \ + uint8x16_t v16_a, v16_b, v16_z; \ + v16_a = vld1q_u8( pui8_src1 ); \ + v16_b = vld1q_u8( pui8_src2 ); \ + v16_z = vld1q_u8( pui8_dst ); \ + v16_a = vrhaddq_u8( v16_a, v16_b ); \ + vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \ + } \ + pui8_src1 += i_src_stride; \ + pui8_src2 += i_src_stride; \ + pui8_dst += i_dst_stride; \ + } \ + } \ + else if( hpelidx == 2 ) \ + { \ + uint8_t *pui8_src1, *pui8_src2; \ + \ + pui8_src1 = pui8_src; \ + pui8_src2 = pui8_src + i_src_stride; \ + \ + for( i_y = 0; i_y < i_height; i_y++ ) \ + { \ + if( i_width == 8 ) \ + { \ + uint8x8_t v8_a, v8_b, v8_z; \ + v8_a = vld1_u8( pui8_src1 ); \ + v8_b = vld1_u8( pui8_src2 ); \ + v8_z = vld1_u8( pui8_dst ); \ + v8_a = vrhadd_u8( v8_a, v8_b ); \ + vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \ + } \ + else if( i_width == 16 ) \ + { \ + uint8x16_t v16_a, v16_b, v16_z; \ + v16_a = vld1q_u8( pui8_src1 ); \ + v16_b = vld1q_u8( pui8_src2 ); \ + v16_z = vld1q_u8( pui8_dst ); \ + v16_a = vrhaddq_u8( v16_a, v16_b ); \ + vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \ + } \ + pui8_src1 += i_src_stride; \ + pui8_src2 += i_src_stride; \ + pui8_dst += i_dst_stride; \ + } \ + } \ + else \ + { \ + uint8_t *pui8_src1, *pui8_src2, *pui8_src3, *pui8_src4; \ + uint8x8_t v8_a, v8_b, v8_c, v8_d, v8_z; \ + uint8x16_t v16_a, v16_b, v16_c, v16_d, v16_z; \ + 
uint8x8_t v8_one = vmov_n_u8( 1 ); \ + uint8x16_t v16_one = vmovq_n_u8( 1 ); \ + \ + pui8_src1 = pui8_src; \ + pui8_src2 = pui8_src + 1; \ + pui8_src3 = pui8_src + i_src_stride; \ + pui8_src4 = pui8_src + i_src_stride + 1; \ + \ + if( i_width == 8 ) \ + { \ + v8_a = vld1_u8( pui8_src1 ); \ + v8_b = vld1_u8( pui8_src2 ); \ + } \ + else if( i_width == 16 ) \ + { \ + v16_a = vld1q_u8( pui8_src1 ); \ + v16_b = vld1q_u8( pui8_src2 ); \ + } \ + for( i_y = 0; i_y < i_height; i_y++ ) \ + { \ + if( i_width == 8 ) \ + { \ + uint8x8_t v8_carry0, v8_carry1; \ + v8_c = vld1_u8( pui8_src3 ); \ + v8_d = vld1_u8( pui8_src4 ); \ + v8_z = vld1_u8( pui8_dst ); \ + \ + v8_carry0 = veor_u8( v8_a, v8_c); \ + v8_carry1 = veor_u8( v8_b, v8_d); \ + \ + v8_a = vrhadd_u8( v8_a, v8_c ); \ + v8_b = vrhadd_u8( v8_b, v8_d ); \ + v8_carry0 = vorr_u8( v8_carry0, v8_carry1 ); \ + \ + v8_carry1 = veor_u8( v8_a, v8_b); \ + v8_carry0 = vand_u8( v8_carry0, v8_carry1 ); \ + v8_carry0 = vand_u8( v8_carry0, v8_one ); \ + \ + v8_a = vrhadd_u8( v8_a, v8_b ); \ + v8_a = vsub_u8( v8_a, v8_carry0 ); \ + \ + vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \ + \ + v8_a = v8_c; \ + v8_b = v8_d; \ + } \ + else if( i_width == 16 ) \ + { \ + uint8x16_t v16_carry0, v16_carry1; \ + v16_c = vld1q_u8( pui8_src3 ); \ + v16_d = vld1q_u8( pui8_src4 ); \ + v16_z = vld1q_u8( pui8_dst ); \ + \ + v16_carry0 = veorq_u8( v16_a, v16_c); \ + v16_carry1 = veorq_u8( v16_b, v16_d); \ + \ + v16_a = vrhaddq_u8( v16_a, v16_c ); \ + v16_b = vrhaddq_u8( v16_b, v16_d ); \ + v16_carry0 = vorrq_u8( v16_carry0, v16_carry1 ); \ + \ + v16_carry1 = veorq_u8( v16_a, v16_b); \ + v16_carry0 = vandq_u8( v16_carry0, v16_carry1 ); \ + v16_carry0 = vandq_u8( v16_carry0, v16_one ); \ + \ + v16_a = vrhaddq_u8( v16_a, v16_b ); \ + v16_a = vsubq_u8( v16_a, v16_carry0 ); \ + \ + vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \ + \ + v16_a = v16_c; \ + v16_b = v16_d; \ + } \ + pui8_src1 += i_src_stride; \ + pui8_src2 += i_src_stride; \ + pui8_src3 += i_src_stride; \ + pui8_src4 += i_src_stride; \ + pui8_dst += i_dst_stride; \ + } \ + } \ +} \ + + +MC_FUNC_NEON( 16x16_00, 16, 16, 0 ); +MC_FUNC_NEON( 16x16_01, 16, 16, 1 ); +MC_FUNC_NEON( 16x16_10, 16, 16, 2 ); +MC_FUNC_NEON( 16x16_11, 16, 16, 3 ); + +MC_FUNC_NEON( 16x8_00, 16, 8, 0 ); +MC_FUNC_NEON( 16x8_01, 16, 8, 1 ); +MC_FUNC_NEON( 16x8_10, 16, 8, 2 ); +MC_FUNC_NEON( 16x8_11, 16, 8, 3 ); + +MC_FUNC_NEON( 8x16_00, 8, 16, 0 ); +MC_FUNC_NEON( 8x16_01, 8, 16, 1 ); +MC_FUNC_NEON( 8x16_10, 8, 16, 2 ); +MC_FUNC_NEON( 8x16_11, 8, 16, 3 ); + +MC_FUNC_NEON( 8x8_00, 8, 8, 0 ); +MC_FUNC_NEON( 8x8_01, 8, 8, 1 ); +MC_FUNC_NEON( 8x8_10, 8, 8, 2 ); +MC_FUNC_NEON( 8x8_11, 8, 8, 3 ); + +MC_FUNC_NEON( 8x4_00, 8, 4, 0 ); +MC_FUNC_NEON( 8x4_01, 8, 4, 1 ); +MC_FUNC_NEON( 8x4_10, 8, 4, 2 ); +MC_FUNC_NEON( 8x4_11, 8, 4, 3 ); + + + + + + + diff --git a/src/y262/pixelop_arm64.h b/src/y262/pixelop_arm64.h new file mode 100644 index 0000000..99d156a --- /dev/null +++ b/src/y262/pixelop_arm64.h @@ -0,0 +1,42 @@ +/* +Copyright (c) 2013, Ralf Willenbacher +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +int32_t y262_sad_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ); +int32_t y262_sad_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ); + +int32_t y262_satd_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ); +int32_t y262_satd_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 ); + +int32_t y262_ssd_8x8_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride ); +int32_t y262_ssd_16x16_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride ); + +void y262_sub_8x8_neon( int16_t *pi16_diff, uint8_t *pui8_src1, int32_t i_stride_src1, uint8_t *pui8_src2, int32_t i_stride_src2 ); +void y262_add_8x8_neon( uint8_t *pui8_destination, int32_t i_destination_stride, uint8_t *pui8_base, int32_t i_base_stride, int16_t *pi16_difference ); + diff --git a/src/y262/transform_arm64.c b/src/y262/transform_arm64.c new file mode 100644 index 0000000..ea7684c --- /dev/null +++ b/src/y262/transform_arm64.c @@ -0,0 +1,444 @@ +/* +Copyright (c) 2013, Ralf Willenbacher +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+*/ + +//#ifdef ASSEMBLY_ARM64 + +#include + +#include "y262.h" + + +#define RND1BITS ( 11 ) +#define RND2BITS ( 31 - RND1BITS ) + + +static const int16_t rgi16_y262_fdct_neon_cs1[ 8 ][ 8 ] = { + { 16383, 16383, 16383, 16383, 16383, 16383, 16383, 16383 }, + { 22724, 19265, 12872, 4520, -4520, -12872, -19265, -22724 }, + { 21406, 8867, -8867, -21406, -21406, -8867, 8867, 21406 }, + { 19265, -4520, -22724, -12872, 12872, 22724, 4520, -19265 }, + { 16383, -16383, -16383, 16383, 16383, -16383, -16383, 16383 }, + { 12872, -22724, 4520, 19265, -19265, -4520, 22724, -12872 }, + { 8867, -21406, 21406, -8867, -8867, 21406, -21406, 8867 }, + { 4520, -12872, 19265, -22724, 22724, -19265, 12872, -4520 }, +}; +static const int16_t rgi16_y262_fdct_neon_cs2[ 32 ] = { + 16385, 16385, 22726, 19266, -8867, -21408, -22726, -12873, + 16385, 16385, 12873, 4521, 21408, 8867, 19266, -4521, + 16385, -16385, 12873, -22726, 21408, -8867, 19266, -22726, + -16385, 16385, 4521, 19266, 8867, -21408, 4521, -12873, +}; + + +void y262_fdct_neon( int16_t *pi16_block, int16_t *pi16_dst ) +{ + int i_i, i_j, i_k; + int16x8_t rgv16_tmp[ 8 ], rgv16_dsts[ 8 ]; + int16x8_t rgv16_e[ 4 ], rgv16_ee[ 2 ]; + int32x4x2_t rgv32_transA[ 4 ]; + int16x8x2_t rgv32_transB[ 4 ]; + int16x4_t v8_mt0, v8_mt1, v8_mt2, v8_mt3, v8_mt4, v8_mt5, v8_mt6, v8_mt7; + +#define RND( x, y ) ( ( ( x ) + ( ( y ) ? ( 1 << ( y - 1 ) ) : 0 ) ) >> ( y ) ) +#define MUL( x, m ) ( ( x ) * ( m ) ) + + for( i_j = 1; i_j < 8; i_j += 2 ) + { + int16x8_t v16_d; + int32x4_t v16_s0, v16_s1; + + v16_d = vsubq_s16( vld1q_s16( &pi16_block[ 8 * 0 ] ), vld1q_s16( &pi16_block[ 8 * 7 ] ) ); + v16_s0 = vmull_n_s16( vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] ); + v16_s1 = vmull_n_s16( vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] ); + + for( i_k = 1; i_k < 4; i_k++ ) + { + v16_d = vsubq_s16( vld1q_s16( &pi16_block[ 8 * i_k ] ), vld1q_s16( &pi16_block[ 8 * ( 7 - i_k ) ] ) ); + v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ i_k ] ); + v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ i_k ] ); + } + + rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) ); + } + + for ( i_k = 0; i_k < 4; i_k++ ) + { + rgv16_e[ i_k ] = vaddq_s16( vld1q_s16( &pi16_block[ 8 * i_k ] ), vld1q_s16( &pi16_block[ 8 * ( 7 - i_k ) ] ) ); + } + + for( i_j = 2; i_j < 8; i_j += 4 ) + { + int16x8_t v16_d; + int32x4_t v16_s0, v16_s1; + + v16_d = vsubq_s16( rgv16_e[ 0 ], rgv16_e[ 3 ] ); + v16_s0 = vmull_n_s16( vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] ); + v16_s1 = vmull_n_s16( vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] ); + + v16_d = vsubq_s16( rgv16_e[ 1 ], rgv16_e[ 2 ] ); + v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] ); + v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] ); + + rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) ); + } + for ( i_k = 0; i_k < 2; i_k++ ) + { + rgv16_ee[ i_k ] = vaddq_s16( rgv16_e[ i_k ], rgv16_e[ 3 - i_k ] ); + } + for( i_j = 0; i_j < 8; i_j += 4 ) + { + int16x8_t v16_d; + int32x4_t v16_s0, v16_s1; + + v16_s0 = vmull_n_s16( vget_low_s16( rgv16_ee[ 0 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] ); + v16_s1 = vmull_n_s16( vget_high_s16( rgv16_ee[ 0 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] ); + + v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( rgv16_ee[ 1 ] ), 
rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] ); + v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( rgv16_ee[ 1 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] ); + + rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) ); + } + + v8_mt0 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 0 ] ); + v8_mt1 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 4 ] ); + v8_mt2 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 8 ] ); + v8_mt3 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 12 ] ); + v8_mt4 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 16 ] ); + v8_mt5 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 20 ] ); + v8_mt6 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 24 ] ); + v8_mt7 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 28 ] ); + + for( i_j = 0; i_j < 8; i_j++ ) + { + int16x4_t v8_l0, v8_l1, v8_o, v8_e, v8_m0, v8_m1, v8_m2, v8_m3; + int16x4x2_t v16_trn0; + int32x2x2_t v16_trn1; + int32x4_t v16_s0, v16_s1, v16_s2, v16_s3; + int16x8_t v16_row; + + v8_l0 = vget_low_s16( rgv16_tmp[ i_j ] ); + v8_l1 = vget_high_s16( rgv16_tmp[ i_j ] ); + v8_l1 = vrev64_s16( v8_l1 ); + v8_o = vsub_s16( v8_l0, v8_l1 ); + v8_e = vadd_s16( v8_l0, v8_l1 ); + + v16_trn1 = vzip_s32( vreinterpret_s32_s16( v8_e ), vreinterpret_s32_s16( v8_o ) ); + v8_m0 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] ); + v8_m1 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] ); + + v16_s0 = vmull_s16( v8_m0, v8_mt0 ); + v16_s1 = vmull_s16( v8_m1, v8_mt1 ); + v16_s0 = vmlal_s16( v16_s0, v8_m1, v8_mt2 ); + v16_s1 = vmlal_s16( v16_s1, v8_m0, v8_mt3 ); + + v16_s2 = vmull_s16( v8_m0, v8_mt4 ); + v16_s3 = vmull_s16( v8_m1, v8_mt5 ); + v16_s2 = vmlal_s16( v16_s2, v8_m1, v8_mt6 ); + v16_s3 = vmlal_s16( v16_s3, v8_m0, v8_mt7 ); + + v16_s0 = vpaddq_s32( v16_s0, v16_s1 ); + v16_s1 = vpaddq_s32( v16_s2, v16_s3 ); + + v16_row = vcombine_s16( vmovn_s32( vrshrq_n_s32( v16_s0, RND2BITS ) ), vmovn_s32(vrshrq_n_s32( v16_s1, RND2BITS ) ) ); + vst1q_s16( pi16_dst + ( 8 * i_j ), v16_row ); + } +} + + +#define RND1BITS ( 11 ) +#define RND2BITS ( 31 - RND1BITS ) + +static const int16_t rgi16_y262_idct_cs1[ 8 ][ 8 ] = { + { 16383, 16383, 16383, 16383, 16383, 16383, 16383, 16383 }, + { 22724, 19265, 12872, 4520, -4520, -12872, -19265, -22724 }, + { 21406, 8867, -8867, -21406, -21406, -8867, 8867, 21406 }, + { 19265, -4520, -22724, -12872, 12872, 22724, 4520, -19265 }, + { 16383, -16383, -16383, 16383, 16383, -16383, -16383, 16383 }, + { 12872, -22724, 4520, 19265, -19265, -4520, 22724, -12872 }, + { 8867, -21406, 21406, -8867, -8867, 21406, -21406, 8867 }, + { 4520, -12872, 19265, -22724, 22724, -19265, 12872, -4520 }, +}; +static const int16_t rgi16_y262_idct_cs2[ 8 ][ 8 ] = { + { 16385, 16385, 16385, 16385, 16385, 16385, 16385, 16385 }, + { 22726, 19266, 12873, 4521, -4521, -12873, -19266, -22726 }, + { 21408, 8867, -8867, -21408, -21408, -8867, 8867, 21408 }, + { 19266, -4521, -22726, -12873, 12873, 22726, 4521, -19266 }, + { 16385, -16385, -16385, 16385, 16385, -16385, -16385, 16385 }, + { 12873, -22726, 4521, 19266, -19266, -4521, 22726, -12873 }, + { 8867, -21408, 21408, -8867, -8867, 21408, -21408, 8867 }, + { 4521, -12873, 19266, -22726, 22726, -19266, 12873, -4521 }, +}; + +static const int16_t rgi16_y262_idct_neon_cs2[ 32 ] = { + 16385, 21408, 16385, 8867, 16385, -8867, 16385, -21408, + 16385, 8867, -16385, -21408, -16385, 21408, 16385, -8867, + 22726, 19266, 19266, -4521, 12873, -22726, 4521, -12873, + 12873, 4521, -22726, -12873, 4521, 19266, 19266, -22726 +}; + + + +void y262_idct_neon( int16_t *pi16_block, int16_t *pi16_dst ) +{ + int i_j, i_k; + int16_t rgi16_tmp[ 64 ]; + int32_t rgi_e[ 4 ], 
rgi_o[ 4 ]; + int32_t rgi_ee[ 2 ], rgi_eo[ 2 ]; + int32x4_t rgv16_o[ 4 ]; + int32x4_t rgv16_eo[ 4 ]; + int32x4_t rgv16_ee[ 4 ]; + int32x4_t rgv16_e[ 4 ]; + int16x4_t rgv8_tmp[ 8 ][ 2 ]; + int16x4_t v8_mt0, v8_mt1, v8_mt2, v8_mt3, v8_mt4, v8_mt5, v8_mt6, v8_mt7; + +#define RND( x, y ) ( ( ( x ) + ( ( y ) ? ( 1 << ( y - 1 ) ) : 0 ) ) >> ( y ) ) +#define MUL( x, m ) ( ( x ) * ( m ) ) + + + for( i_j = 0; i_j < 2; i_j++ ) + { + int16x4_t v8_b0, v8_b1, v8_b2, v8_b3; + + v8_b0 = vld1_s16( pi16_block + ( 8 * 1 ) + ( i_j * 4 ) ); + v8_b1 = vld1_s16( pi16_block + ( 8 * 3 ) + ( i_j * 4 ) ); + v8_b2 = vld1_s16( pi16_block + ( 8 * 5 ) + ( i_j * 4 ) ); + v8_b3 = vld1_s16( pi16_block + ( 8 * 7 ) + ( i_j * 4 ) ); + + rgv16_o[ 0 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 0 ] ); + rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 0 ] ); + rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 0 ] ); + rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 0 ] ); + + rgv16_o[ 1 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 1 ] ); + rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 1 ] ); + rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 1 ] ); + rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 1 ] ); + + rgv16_o[ 2 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 2 ] ); + rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 2 ] ); + rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 2 ] ); + rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 2 ] ); + + rgv16_o[ 3 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 3 ] ); + rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 3 ] ); + rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 3 ] ); + rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 3 ] ); + + v8_b0 = vld1_s16( pi16_block + ( 8 * 2 ) + ( i_j * 4 ) ); + v8_b1 = vld1_s16( pi16_block + ( 8 * 6 ) + ( i_j * 4 ) ); + v8_b2 = vld1_s16( pi16_block + ( 8 * 0 ) + ( i_j * 4 ) ); + v8_b3 = vld1_s16( pi16_block + ( 8 * 4 ) + ( i_j * 4 ) ); + + rgv16_eo[ 0 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 2 ][ 0 ] ); + rgv16_eo[ 0 ] = vmlal_n_s16( rgv16_eo[ 0 ], v8_b1, rgi16_y262_idct_cs1[ 6 ][ 0 ] ); + rgv16_eo[ 1 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 2 ][ 1 ] ); + rgv16_eo[ 1 ] = vmlal_n_s16( rgv16_eo[ 1 ], v8_b1, rgi16_y262_idct_cs1[ 6 ][ 1 ] ); + rgv16_ee[ 0 ] = vmull_n_s16( v8_b2, rgi16_y262_idct_cs1[ 0 ][ 0 ] ); + rgv16_ee[ 0 ] = vmlal_n_s16( rgv16_ee[ 0 ], v8_b3, rgi16_y262_idct_cs1[ 4 ][ 0 ] ); + rgv16_ee[ 1 ] = vmull_n_s16( v8_b2, rgi16_y262_idct_cs1[ 0 ][ 1 ] ); + rgv16_ee[ 1 ] = vmlal_n_s16( rgv16_ee[ 1 ], v8_b3, rgi16_y262_idct_cs1[ 4 ][ 1 ] ); + + rgv16_e[ 0 ] = vaddq_s32( rgv16_ee[ 0 ], rgv16_eo[ 0 ] ); + rgv16_e[ 1 ] = vaddq_s32( rgv16_ee[ 1 ], rgv16_eo[ 1 ] ); + rgv16_e[ 2 ] = vsubq_s32( rgv16_ee[ 1 ], rgv16_eo[ 1 ] ); + rgv16_e[ 3 ] = vsubq_s32( rgv16_ee[ 0 ], rgv16_eo[ 0 ] ); + + + for( i_k = 0; i_k < 4; i_k++ ) + { + int32x4_t v16_eoa, v16_eos; + v16_eoa = vaddq_s32( rgv16_e[ i_k ], rgv16_o[ i_k ]); + rgv8_tmp[ i_k ][ i_j ] = vqrshrn_n_s32( v16_eoa, RND1BITS ); + v16_eos = vsubq_s32( rgv16_e[ 3 - i_k ], rgv16_o[ 3 - i_k ]); + rgv8_tmp[ i_k + 4 ][ i_j ] = vqrshrn_n_s32( v16_eos, RND1BITS ); + } + } + + v8_mt0 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 0 ] ); + v8_mt1 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 4 ] 
); + v8_mt2 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 8 ] ); + v8_mt3 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 12 ] ); + v8_mt4 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 16 ] ); + v8_mt5 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 20 ] ); + v8_mt6 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 24 ] ); + v8_mt7 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 28 ] ); + + for( i_j = 0; i_j < 8; i_j++ ) + { + int16x4_t v8_l02, v8_l46, v8_l13, v8_l57, v8_m0, v8_m1, v8_m2, v8_m3, v8_m4, v8_m5, v8_m6, v8_m7; + int16x4x2_t v16_trn0; + int32x2x2_t v16_trn1; + int32x4_t v16_s0, v16_s1, v16_s2, v16_s3, v16_s4, v16_s5, v16_s6, v16_s7, v16_e, v16_o; + int16x8_t v16_row; + + v8_m0 = rgv8_tmp[ i_j ][ 0 ]; + v8_m1 = rgv8_tmp[ i_j ][ 1 ]; + + v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v8_m0 ), vreinterpret_s32_s16( v8_m1 ) ); + v16_trn0 = vzip_s16( vreinterpret_s16_s32( v16_trn1.val[ 0 ] ),vreinterpret_s16_s32( v16_trn1.val[ 1 ] ) ); + + v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v16_trn0.val[ 0 ] ), vreinterpret_s32_s16( v16_trn0.val[ 0 ] ) ); + v8_l02 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] ); + v8_l13 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] ); + + v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v16_trn0.val[ 1 ] ), vreinterpret_s32_s16( v16_trn0.val[ 1 ] ) ); + v8_l46 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] ); + v8_l57 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] ); + + v16_s0 = vmull_s16( v8_l02, v8_mt0 ); + v16_s0 = vmlal_s16( v16_s0, v8_l46, v8_mt2 ); + v16_s1 = vmull_s16( v8_l02, v8_mt1 ); + v16_s1 = vmlal_s16( v16_s1, v8_l46, v8_mt3 ); + v16_s2 = vmull_s16( v8_l13, v8_mt4 ); + v16_s2 = vmlal_s16( v16_s2, v8_l57, v8_mt6 ); + v16_s3 = vmull_s16( v8_l13, v8_mt5 ); + v16_s3 = vmlal_s16( v16_s3, v8_l57, v8_mt7 ); + + v16_s0 = vpaddq_s32( v16_s0, v16_s1 ); + v16_s1 = vpaddq_s32( v16_s2, v16_s3 ); + + v16_e = vaddq_s32( v16_s0, v16_s1 ); + v16_o = vsubq_s32( v16_s0, v16_s1 ); + v16_o = vcombine_s32( vrev64_s32( vget_high_s32( v16_o ) ), vrev64_s32( vget_low_s32( v16_o ) ) ); + + v16_row = vcombine_s16( vmovn_s32( vrshrq_n_s32( v16_e, RND2BITS ) ), vmovn_s32(vrshrq_n_s32( v16_o, RND2BITS ) ) ); + vst1q_s16( pi16_dst + ( 8 * i_j ), v16_row ); + } +} + + +int32_t y262_quant8x8_intra_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat, uint16_t *pui16_bias ) +{ + int32_t i_y, i_x, i_qm, i_nz, i_intra_dc; + int16x8_t v16_zero, v16_nz, v16_2047, v16_m2048; + + v16_nz = v16_zero = vmovq_n_s16( 0 ); + v16_2047 = vmovq_n_s16( 2047 ); + v16_m2048 = vmovq_n_s16( -2048 ); + + i_intra_dc = pi_coeffs[ 0 ]; + pi_coeffs[ 0 ] = 0; + + i_nz = 0; + for( i_y = 0; i_y < 8; i_y++ ) + { + int16x8_t v16_co; + int16x8_t v16_mask; + uint16x8_t v16_qm, v16_bias, v16_cou; + uint32x4_t v16_res0, v16_res1; + + v16_co = vld1q_s16( pi_coeffs + ( i_y * 8 ) ); + v16_qm = vld1q_u16( pui16_qmat + ( i_y * 8 ) ); + v16_bias = vld1q_u16( pui16_bias + ( i_y * 8 ) ); + v16_mask = vreinterpretq_s16_u16( vcgtq_s16( v16_zero, v16_co ) ); + v16_co = veorq_s16( v16_co, v16_mask ); + v16_co = vsubq_s16( v16_co, v16_mask ); + + v16_cou = vaddq_u16( vreinterpretq_u16_s16( v16_co ), v16_bias ); + v16_res0 = vmull_u16( vget_low_u16( v16_cou ), vget_low_u16( v16_qm ) ); + v16_res1 = vmull_u16( vget_high_u16( v16_cou ), vget_high_u16( v16_qm ) ); + + v16_co = vreinterpretq_s16_u16( vcombine_u16( vshrn_n_u32( v16_res0, 16 ), vshrn_n_u32( v16_res1, 16 ) ) ); + + v16_co = veorq_s16( v16_co, v16_mask ); + v16_co = vsubq_s16( v16_co, v16_mask ); + + v16_co = vminq_s16( v16_co, v16_2047 ); + v16_co = vmaxq_s16( v16_co, v16_m2048 ); + + vst1q_s16( pi_coeffs + ( i_y * 8 ), 
+        v16_nz = vorrq_s16( v16_co, v16_nz );
+    }
+
+    v16_nz = vnegq_s16( vreinterpretq_s16_u16( vceqzq_s16( v16_nz ) ) );
+    i_nz = vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 0 );
+    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 1 );
+    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 2 );
+    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 3 );
+
+    pi_coeffs[ 0 ] = i_intra_dc;
+
+    return i_nz;
+}
+
+
+
+int32_t y262_quant8x8_inter_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat )
+{
+    int32_t i_y, i_x, i_qm, i_nz;
+    int16x8_t v16_zero, v16_nz, v16_2047, v16_m2048;
+
+    v16_nz = v16_zero = vmovq_n_s16( 0 );
+    v16_2047 = vmovq_n_s16( 2047 );
+    v16_m2048 = vmovq_n_s16( -2048 );
+
+    i_nz = 0;
+    for( i_y = 0; i_y < 8; i_y++ )
+    {
+        int16x8_t v16_co;
+        int16x8_t v16_mask;
+        uint16x8_t v16_qm;
+        uint32x4_t v16_res0, v16_res1;
+
+        v16_co = vld1q_s16( pi_coeffs + ( i_y * 8 ) );
+        v16_qm = vld1q_u16( pui16_qmat + ( i_y * 8 ) );
+        v16_mask = vreinterpretq_s16_u16( vcgtq_s16( v16_zero, v16_co ) );
+        v16_co = veorq_s16( v16_co, v16_mask );
+        v16_co = vsubq_s16( v16_co, v16_mask );
+
+        v16_res0 = vmull_u16( vreinterpret_u16_s16( vget_low_s16( v16_co ) ), vget_low_u16( v16_qm ) );
+        v16_res1 = vmull_u16( vreinterpret_u16_s16( vget_high_s16( v16_co ) ), vget_high_u16( v16_qm ) );
+
+        v16_co = vreinterpretq_s16_u16( vcombine_u16( vshrn_n_u32( v16_res0, 16 ), vshrn_n_u32( v16_res1, 16 ) ) );
+
+        v16_co = veorq_s16( v16_co, v16_mask );
+        v16_co = vsubq_s16( v16_co, v16_mask );
+
+        v16_co = vminq_s16( v16_co, v16_2047 );
+        v16_co = vmaxq_s16( v16_co, v16_m2048 );
+
+        vst1q_s16( pi_coeffs + ( i_y * 8 ), v16_co );
+
+        v16_nz = vorrq_s16( v16_co, v16_nz );
+    }
+
+    v16_nz = vnegq_s16( vreinterpretq_s16_u16( vceqzq_s16( v16_nz ) ) );
+    i_nz = vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 0 );
+    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 1 );
+    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 2 );
+    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 3 );
+
+    return i_nz;
+}
+
+
+
+//#endif
\ No newline at end of file
diff --git a/src/y262/transform_arm64.h b/src/y262/transform_arm64.h
new file mode 100644
index 0000000..6fafe07
--- /dev/null
+++ b/src/y262/transform_arm64.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2013, Ralf Willenbacher
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+void y262_fdct_neon( int16_t *pi16_block, int16_t *pi16_dst );
+void y262_idct_neon( int16_t *pi16_block, int16_t *pi16_dst );
+
+int32_t y262_quant8x8_intra_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat, uint16_t *pui16_bias );
+int32_t y262_quant8x8_inter_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat );
+
diff --git a/src/y262/y262.h b/src/y262/y262.h
index 309aef9..49d642a 100644
--- a/src/y262/y262.h
+++ b/src/y262/y262.h
@@ -59,11 +59,19 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "transform.h"
 #include "pixelop.h"
 #include "me.h"
-#include "transform_x86.h"
-#include "pixelop_x86.h"
 #include "ratectrl.h"
 #include "threads.h"
 
+#ifdef ASSEMBLY_X86
+#include "transform_x86.h"
+#include "pixelop_x86.h"
+#endif
+
+#ifdef ASSEMBLY_ARM64
+#include "transform_arm64.h"
+#include "pixelop_arm64.h"
+#endif
+
 void y262_init_motion_compensation( y262_t *ps_y262 );
 void y262_error( y262_t *ps_y262, int32_t i_error_code, int8_t* pi8_format, ... );
 void y262_encode_picture( y262_t *ps_y262, y262_picture_t *ps_picture, int32_t i_picture_type, int32_t i_pon );
diff --git a/src/y262/y262api.c b/src/y262/y262api.c
index be4ef48..7f150c8 100644
--- a/src/y262/y262api.c
+++ b/src/y262/y262api.c
@@ -548,6 +548,7 @@ int32_t y262_initialize( void *p_y262, y262_configuration_t *ps_config )
     ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_c;
     ps_y262->s_funcs.f_idct_8x8 = y262_idct_c;
 
+#ifdef ASSEMBLY_X86
     if( 1 )
     {
         ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x16 ] = y262_sad_16x16_sse2;
@@ -565,7 +566,29 @@ int32_t y262_initialize( void *p_y262, y262_configuration_t *ps_config )
         ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_sse2;
         ps_y262->s_funcs.f_idct_8x8 = y262_idct_sse2;
     }
+#endif
+#ifdef ASSEMBLY_ARM64
+    if( 1 )
+    {
+        ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x16 ] = y262_sad_16x16_neon;
+        ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x8 ] = y262_sad_16x8_neon;
+        ps_y262->s_funcs.rgf_satd[ BLOCK_TYPE_16x16 ] = y262_satd_16x16_neon;
+        ps_y262->s_funcs.rgf_satd[ BLOCK_TYPE_16x8 ] = y262_satd_16x8_neon;
+
+        ps_y262->s_funcs.f_ssd_16x16 = y262_ssd_16x16_neon;
+        ps_y262->s_funcs.f_ssd_8x8 = y262_ssd_8x8_neon;
+        ps_y262->s_funcs.f_add_8x8 = y262_add_8x8_neon;
+        ps_y262->s_funcs.f_sub_8x8 = y262_sub_8x8_neon;
+        ps_y262->s_funcs.f_quant8x8_intra_fw = y262_quant8x8_intra_fw_mpeg2_neon;
+        ps_y262->s_funcs.f_quant8x8_inter_fw = y262_quant8x8_inter_fw_mpeg2_neon;
+
+        ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_neon;
+        ps_y262->s_funcs.f_idct_8x8 = y262_idct_neon;
+
+
+    }
+#endif
 
 
     memset( ps_y262->rgi_y262_motion_bits_x, 0, sizeof( ps_y262->rgi_y262_motion_bits_x ) );
     memset( ps_y262->rgi_y262_motion_bits_y, 1, sizeof( ps_y262->rgi_y262_motion_bits_y ) );
diff --git a/src/y262app/CMakeLists.txt b/src/y262app/CMakeLists.txt
index 604df94..a39d634 100644
--- a/src/y262app/CMakeLists.txt
+++ b/src/y262app/CMakeLists.txt
@@ -1,12 +1,6 @@
 cmake_minimum_required(VERSION 3.1)
 project(y262app)
 
-if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-    set(ARCH "_x64")
-else()
-    set(ARCH "_x86")
-endif()
-
 set( SRC_FILES
     ${CMAKE_CURRENT_SOURCE_DIR}/main.c
 )
@@ -14,7 +8,7 @@ set( SRC_FILES
 add_executable(y262app ${SRC_FILES})
 target_link_libraries(y262app liby262)
 set_target_properties(y262app PROPERTIES
-    OUTPUT_NAME "y262$<$<CONFIG:Debug>:d>${ARCH}"
+    OUTPUT_NAME "y262$<$<CONFIG:Debug>:d>"
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
     POSITION_INDEPENDENT_CODE ON
 )