Removed x86-specific parts from CMake and added arm64 intrinsic functions; CMake should now work on all platforms.
This commit is contained in:
parent 068ffc674c · commit 96bbc6c07e
10 changed files with 1527 additions and 43 deletions
@ -4,32 +4,45 @@ project(liby262)

find_package(Threads)

find_program(YASM_EXE NAMES yasm)

if(CMAKE_SIZEOF_VOID_P EQUAL 8)
    set(ARCH "_x64")
    if(WIN32)
        set(YASM_ARGS -f win32 -m amd64 -DARCH_X86_64 -DPIC)
    elseif(APPLE)
        set(YASM_ARGS -f macho64 -m amd64 -DARCH_X86_64 -DPIC --prefix=_)
    else()
        set(YASM_ARGS -f elf64 -m amd64 -DARCH_X86_64 -DPIC)
    endif()
else()
    set(ARCH "_x86")
    if(WIN32)
        set(YASM_ARGS -f win32 --prefix=_)
    elseif(APPLE)
        set(YASM_ARGS -f macho32 --prefix=_)
    else()
        set(YASM_ARGS -f elf32)
    endif()
message( "architecture: ${CMAKE_SYSTEM_PROCESSOR}")

set(Y262_TARGET_ARCH "unknown")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
    set(Y262_TARGET_ARCH "intelx86")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|x86.*|i386.*")
    set(Y262_TARGET_ARCH "intelx86")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|ARM64.*|AARCH64.*)")
    set(Y262_TARGET_ARCH "arm64")
endif()

add_custom_command(OUTPUT pixelop_x86.o COMMAND ${YASM_EXE}
    ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_x86.asm)
add_custom_command(OUTPUT transform_x86.o COMMAND ${YASM_EXE}
    ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/transform_x86.asm)
message( "target_arch: ${Y262_TARGET_ARCH}")

add_library(liby262 STATIC
if(Y262_TARGET_ARCH MATCHES "intelx86")
    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
        if(WIN32)
            set(YASM_ARGS -f win32 -m amd64 -DARCH_X86_64 -DPIC)
        elseif(APPLE)
            set(YASM_ARGS -f macho64 -m amd64 -DARCH_X86_64 -DPIC --prefix=_)
        else()
            set(YASM_ARGS -f elf64 -m amd64 -DARCH_X86_64 -DPIC)
        endif()
    else()
        if(WIN32)
            set(YASM_ARGS -f win32 --prefix=_)
        elseif(APPLE)
            set(YASM_ARGS -f macho32 --prefix=_)
        else()
            set(YASM_ARGS -f elf32)
        endif()
    endif()

    add_custom_command(OUTPUT pixelop_x86.o COMMAND ${YASM_EXE}
        ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_x86.asm)
    add_custom_command(OUTPUT transform_x86.o COMMAND ${YASM_EXE}
        ARGS ${YASM_ARGS} -o ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o ${CMAKE_CURRENT_SOURCE_DIR}/transform_x86.asm)
endif()

set(liby262_sources_basic
    ${CMAKE_CURRENT_SOURCE_DIR}/aboveslicelevel.h
    ${CMAKE_CURRENT_SOURCE_DIR}/bitstream.h
    ${CMAKE_CURRENT_SOURCE_DIR}/lookahead.h

@ -56,17 +69,36 @@ add_library(liby262 STATIC
    ${CMAKE_CURRENT_SOURCE_DIR}/transform.c
    ${CMAKE_CURRENT_SOURCE_DIR}/y262.c
    ${CMAKE_CURRENT_SOURCE_DIR}/y262api.c

    ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o
    ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o
)

set_target_properties(liby262 PROPERTIES
    OUTPUT_NAME "liby262$<$<CONFIG:Debug>:d>${ARCH}"
    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
set(liby262_sources_assembly "")

if(Y262_TARGET_ARCH MATCHES "intelx86")
    set(liby262_sources_assembly
        ${CMAKE_CURRENT_BINARY_DIR}/pixelop_x86.o
        ${CMAKE_CURRENT_BINARY_DIR}/transform_x86.o
    )
elseif(Y262_TARGET_ARCH MATCHES "arm64")
    set(liby262_sources_assembly
        ${CMAKE_CURRENT_SOURCE_DIR}/transform_arm64.c
        ${CMAKE_CURRENT_SOURCE_DIR}/transform_arm64.h
        ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_arm64.c
        ${CMAKE_CURRENT_SOURCE_DIR}/pixelop_arm64.h
    )
endif()

add_library(liby262 STATIC
    ${liby262_sources_basic}
    ${liby262_sources_assembly}
)

target_include_directories(liby262 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

if(Y262_TARGET_ARCH MATCHES "intelx86")
    add_compile_definitions(ASSEMBLY_X86)
elseif(Y262_TARGET_ARCH MATCHES "arm64")
    add_compile_definitions(ASSEMBLY_ARM64)
endif()

if(WIN32)
    target_compile_definitions(liby262 PRIVATE WIN32)

@ -77,8 +109,12 @@ else()
    target_link_libraries(liby262 PUBLIC m)
endif()

set_target_properties(liby262 PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(liby262 PROPERTIES
    OUTPUT_NAME "liby262$<$<CONFIG:Debug>:d>${MY_ARCH}"
    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
    POSITION_INDEPENDENT_CODE ON
)

target_include_directories(liby262 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

target_link_libraries(liby262 PUBLIC Threads::Threads)
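Editor's note: with this change the yasm-built x86 objects are only assembled when Y262_TARGET_ARCH resolves to "intelx86", the NEON intrinsic files are compiled when it resolves to "arm64", and the matching ASSEMBLY_X86 / ASSEMBLY_ARM64 definitions gate the platform code in the sources. A minimal C sketch of how such a definition is typically consumed (illustration only; y262_pick_sad_16x16 and the non-NEON function names are assumptions, not code from this commit — the declarations would come from the corresponding headers):

#include <stdint.h>

typedef int32_t ( *y262_sad_fn )( uint8_t *, int32_t, uint8_t *, int32_t );

static y262_sad_fn y262_pick_sad_16x16( void )
{
#if defined( ASSEMBLY_ARM64 )
    return y262_sad_16x16_neon;  /* NEON intrinsics from pixelop_arm64.c, added below */
#elif defined( ASSEMBLY_X86 )
    return y262_sad_16x16_sse2;  /* yasm-assembled routine, assumed name */
#else
    return y262_sad_16x16;       /* portable C fallback, assumed name */
#endif
}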
127 src/y262/mc.c
@ -30,6 +30,8 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "y262.h"
|
||||
|
||||
#ifdef ASSEMBLY_X86
|
||||
|
||||
void y262_motcomp_16x16_00_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_01_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_10_put_sse2( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
@ -80,6 +82,65 @@ void y262_motcomp_8x4_01_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, ui
|
|||
void y262_motcomp_8x4_10_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_11_avg_mmxext( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef ASSEMBLY_ARM64
|
||||
|
||||
void y262_motcomp_16x16_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_16x8_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x16_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x8_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x4_00_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_01_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_10_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_11_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_16x16_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x16_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_16x8_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_16x8_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x16_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x16_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x8_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x8_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
void y262_motcomp_8x4_00_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_01_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_10_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
void y262_motcomp_8x4_11_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride );
|
||||
|
||||
#endif
@ -322,7 +383,7 @@ void y262_init_motion_compensation( y262_t *ps_y262 )
|
|||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_avg;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg;
|
||||
|
||||
#if 1
|
||||
#ifdef ASSEMBLY_X86
|
||||
|
||||
if( 1 )
|
||||
{
|
||||
|
@ -382,6 +443,70 @@ void y262_init_motion_compensation( y262_t *ps_y262 )
|
|||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg_mmxext;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ASSEMBLY_ARM64
|
||||
|
||||
if( 1 )
|
||||
{
|
||||
/* copy */
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_00 ] = y262_motcomp_16x16_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ] = y262_motcomp_16x16_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_10 ] = y262_motcomp_16x16_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_11 ] = y262_motcomp_16x16_11_put_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_00 ] = y262_motcomp_16x8_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_01 ] = y262_motcomp_16x8_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_10 ] = y262_motcomp_16x8_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x8 ][ MC_BLOCK_11 ] = y262_motcomp_16x8_11_put_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_00 ] = y262_motcomp_8x16_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_01 ] = y262_motcomp_8x16_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_10 ] = y262_motcomp_8x16_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x16 ][ MC_BLOCK_11 ] = y262_motcomp_8x16_11_put_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_00 ] = y262_motcomp_8x8_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_01 ] = y262_motcomp_8x8_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_10 ] = y262_motcomp_8x8_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x8 ][ MC_BLOCK_11 ] = y262_motcomp_8x8_11_put_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_00 ] = y262_motcomp_8x4_00_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_01 ] = y262_motcomp_8x4_01_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_put_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_put_neon;
|
||||
|
||||
|
||||
|
||||
/* avg */
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_00 ] = y262_motcomp_16x16_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ] = y262_motcomp_16x16_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_10 ] = y262_motcomp_16x16_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x16 ][ MC_BLOCK_11 ] = y262_motcomp_16x16_11_avg_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_00 ] = y262_motcomp_16x8_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_01 ] = y262_motcomp_16x8_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_10 ] = y262_motcomp_16x8_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_16x8 ][ MC_BLOCK_11 ] = y262_motcomp_16x8_11_avg_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_00 ] = y262_motcomp_8x16_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_01 ] = y262_motcomp_8x16_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_10 ] = y262_motcomp_8x16_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x16 ][ MC_BLOCK_11 ] = y262_motcomp_8x16_11_avg_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_00 ] = y262_motcomp_8x8_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_01 ] = y262_motcomp_8x8_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_10 ] = y262_motcomp_8x8_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x8 ][ MC_BLOCK_11 ] = y262_motcomp_8x8_11_avg_neon;
|
||||
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_00 ] = y262_motcomp_8x4_00_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_01 ] = y262_motcomp_8x4_01_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_10 ] = y262_motcomp_8x4_10_avg_neon;
|
||||
ps_y262->s_funcs.rgf_motcomp_avg[ MC_BLOCK_8x4 ][ MC_BLOCK_11 ] = y262_motcomp_8x4_11_avg_neon;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
}
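Editor's note: the blocks above only swap entries in the encoder's function-pointer tables; callers are unchanged. A minimal sketch of a call through the table (illustration only; pui8_ref, pui8_pred and the stride variables are hypothetical, and the entry type is assumed to match the prototypes declared at the top of this file):

void ( *f_put )( uint8_t *, int32_t, uint8_t *, int32_t );

f_put = ps_y262->s_funcs.rgf_motcomp_copy[ MC_BLOCK_16x16 ][ MC_BLOCK_01 ]; /* 16x16 block, horizontal half-pel */
f_put( pui8_ref, i_ref_stride, pui8_pred, i_pred_stride );                  /* same call whether the entry is C, SSE2 or NEON */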
@ -226,6 +226,8 @@ int32_t y262_satd_16x8( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk
|
|||
return i_satd;
|
||||
}
|
||||
|
||||
#ifdef ASSEMBLY_X86
|
||||
|
||||
int32_t y262_satd_16x16_sse2( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_satd;
|
||||
|
@ -249,6 +251,7 @@ int32_t y262_satd_16x8_sse2( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui
|
|||
return i_satd;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
int32_t y262_ssd_16x16( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride )
|
||||
{
|
||||
773 src/y262/pixelop_arm64.c (new file)
@ -0,0 +1,773 @@
|
|||
/*
|
||||
Copyright (c) 2013, Ralf Willenbacher
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "y262.h"
|
||||
|
||||
int32_t y262_sad_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_sad, i_y;
|
||||
int64_t i64_sad;
|
||||
uint8x16_t v16_blk1, v16_blk2;
|
||||
uint8x8_t v8_a, v8_b;
|
||||
uint16x8_t v16_res0, v16_res1;
|
||||
uint32x4_t v16_hadd0;
|
||||
uint32x2_t v8_hadd1;
|
||||
uint64x1_t v8_hadd2;
|
||||
|
||||
v16_blk1 = vld1q_u8 ( pui8_blk1 );
|
||||
pui8_blk1 += i_stride1;
|
||||
v16_blk2 = vld1q_u8 ( pui8_blk2 );
|
||||
pui8_blk2 += i_stride2;
|
||||
v8_a = vget_low_u8( v16_blk1 );
|
||||
v8_b = vget_low_u8( v16_blk2 );
|
||||
v16_res0 = vabdl_u8 ( v8_a, v8_b );
|
||||
v8_a = vget_high_u8( v16_blk1 );
|
||||
v8_b = vget_high_u8( v16_blk2 );
|
||||
v16_res1 = vabdl_u8 ( v8_a, v8_b );
|
||||
|
||||
for( i_y = 1; i_y < 8; i_y++ )
|
||||
{
|
||||
v16_blk1 = vld1q_u8 ( pui8_blk1 );
|
||||
pui8_blk1 += i_stride1;
|
||||
v16_blk2 = vld1q_u8 ( pui8_blk2 );
|
||||
pui8_blk2 += i_stride2;
|
||||
v8_a = vget_low_u8( v16_blk1 );
|
||||
v8_b = vget_low_u8( v16_blk2 );
|
||||
v16_res0 = vabal_u8 ( v16_res0, v8_a, v8_b );
|
||||
v8_a = vget_high_u8( v16_blk1 );
|
||||
v8_b = vget_high_u8( v16_blk2 );
|
||||
v16_res1 = vabal_u8 ( v16_res1, v8_a, v8_b );
|
||||
}
|
||||
|
||||
v16_res0 = vaddq_u16( v16_res0, v16_res1 );
|
||||
v16_hadd0 = vpaddlq_u16( v16_res0 );
|
||||
v8_hadd1 = vadd_u32( vget_low_u32( v16_hadd0 ), vget_high_u32( v16_hadd0 ) );
|
||||
v8_hadd2 = vpaddl_u32( v8_hadd1 );
|
||||
|
||||
i64_sad = vget_lane_u64( v8_hadd2, 0 );
|
||||
i_sad = ( int32_t )i64_sad;
|
||||
|
||||
return i_sad;
|
||||
}
|
||||
|
||||
|
||||
int32_t y262_sad_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_sad, i_y;
|
||||
int64_t i64_sad;
|
||||
uint8x16_t v16_blk1, v16_blk2;
|
||||
uint8x8_t v8_a, v8_b;
|
||||
uint16x8_t v16_res0, v16_res1;
|
||||
uint32x4_t v16_hadd0;
|
||||
uint32x2_t v8_hadd1;
|
||||
uint64x1_t v8_hadd2;
|
||||
|
||||
v16_blk1 = vld1q_u8 ( pui8_blk1 );
|
||||
pui8_blk1 += i_stride1;
|
||||
v16_blk2 = vld1q_u8 ( pui8_blk2 );
|
||||
pui8_blk2 += i_stride2;
|
||||
v8_a = vget_low_u8( v16_blk1 );
|
||||
v8_b = vget_low_u8( v16_blk2 );
|
||||
v16_res0 = vabdl_u8 ( v8_a, v8_b );
|
||||
v8_a = vget_high_u8( v16_blk1 );
|
||||
v8_b = vget_high_u8( v16_blk2 );
|
||||
v16_res1 = vabdl_u8 ( v8_a, v8_b );
|
||||
|
||||
for( i_y = 1; i_y < 16; i_y++ )
|
||||
{
|
||||
v16_blk1 = vld1q_u8 ( pui8_blk1 );
|
||||
pui8_blk1 += i_stride1;
|
||||
v16_blk2 = vld1q_u8 ( pui8_blk2 );
|
||||
pui8_blk2 += i_stride2;
|
||||
v8_a = vget_low_u8( v16_blk1 );
|
||||
v8_b = vget_low_u8( v16_blk2 );
|
||||
v16_res0 = vabal_u8 ( v16_res0, v8_a, v8_b );
|
||||
v8_a = vget_high_u8( v16_blk1 );
|
||||
v8_b = vget_high_u8( v16_blk2 );
|
||||
v16_res1 = vabal_u8 ( v16_res1, v8_a, v8_b );
|
||||
}
|
||||
|
||||
v16_res0 = vaddq_u16( v16_res0, v16_res1 );
|
||||
v16_hadd0 = vpaddlq_u16( v16_res0 );
|
||||
v8_hadd1 = vadd_u32( vget_low_u32( v16_hadd0 ), vget_high_u32( v16_hadd0 ) );
|
||||
v8_hadd2 = vpaddl_u32( v8_hadd1 );
|
||||
|
||||
i64_sad = vget_lane_u64( v8_hadd2, 0 );
|
||||
i_sad = ( int32_t )i64_sad;
|
||||
|
||||
return i_sad;
|
||||
}
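Editor's note: both routines above accumulate absolute differences with vabdl/vabal and then fold the vector lanes down to a single sum. For reference, a scalar version of the same computation (a sketch for this write-up, not code from the commit; the _c_ref name is hypothetical):

#include <stdint.h>
#include <stdlib.h>

int32_t y262_sad_16x16_c_ref( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
{
    int32_t i_x, i_y, i_sad = 0;

    for( i_y = 0; i_y < 16; i_y++ )
    {
        for( i_x = 0; i_x < 16; i_x++ )
        {
            i_sad += abs( ( int32_t )pui8_blk1[ i_x ] - ( int32_t )pui8_blk2[ i_x ] );
        }
        pui8_blk1 += i_stride1;
        pui8_blk2 += i_stride2;
    }
    return i_sad;
}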
|
||||
|
||||
|
||||
#define HADAMARD_NEON_4x2( d0, d1, d2, d3 ) \
|
||||
{ \
|
||||
int32x4x2_t v32_a0, v32_b0; \
|
||||
int16x8x2_t v32_a1, v32_b1; \
|
||||
\
|
||||
d0 = vaddq_s16( d0, d1 ); \
|
||||
d2 = vaddq_s16( d2, d3 ); \
|
||||
\
|
||||
d1 = vaddq_s16( d1, d1 ); \
|
||||
d3 = vaddq_s16( d3, d3 ); \
|
||||
d1 = vsubq_s16( d1, d0 ); \
|
||||
d3 = vsubq_s16( d3, d2 ); \
|
||||
\
|
||||
d0 = vaddq_s16( d0, d2 ); \
|
||||
d1 = vaddq_s16( d1, d3 ); \
|
||||
\
|
||||
d2 = vaddq_s16( d2, d2 ); \
|
||||
d3 = vaddq_s16( d3, d3 ); \
|
||||
d2 = vsubq_s16( d2, d0 ); \
|
||||
d3 = vsubq_s16( d3, d1 ); \
|
||||
\
|
||||
v32_a0 = vtrnq_s32( vreinterpretq_s32_s16( d0 ), vreinterpretq_s32_s16( d2 ) ); \
|
||||
v32_b0 = vtrnq_s32( vreinterpretq_s32_s16( d1 ), vreinterpretq_s32_s16( d3 ) ); \
|
||||
v32_a1 = vtrnq_s16( vreinterpretq_s16_s32( v32_a0.val[ 0 ] ), vreinterpretq_s16_s32( v32_b0.val[ 0 ] ) ); \
|
||||
v32_b1 = vtrnq_s16( vreinterpretq_s16_s32( v32_a0.val[ 1 ] ), vreinterpretq_s16_s32( v32_b0.val[ 1 ] ) ); \
|
||||
d0 = vcombine_s16( vget_low_s16( v32_a1.val[ 0 ] ), vget_high_s16( v32_a1.val[ 0 ] ) ); \
|
||||
d1 = vcombine_s16( vget_low_s16( v32_a1.val[ 1 ] ), vget_high_s16( v32_a1.val[ 1 ] ) ); \
|
||||
d2 = vcombine_s16( vget_low_s16( v32_b1.val[ 1 ] ), vget_high_s16( v32_b1.val[ 1 ] ) ); \
|
||||
d3 = vcombine_s16( vget_low_s16( v32_b1.val[ 0 ] ), vget_high_s16( v32_b1.val[ 0 ] ) ); \
|
||||
\
|
||||
d0 = vaddq_s16( d0, d1 ); \
|
||||
d2 = vaddq_s16( d2, d3 ); \
|
||||
\
|
||||
d1 = vaddq_s16( d1, d1 ); \
|
||||
d3 = vaddq_s16( d3, d3 ); \
|
||||
d1 = vsubq_s16( d1, d0 ); \
|
||||
d3 = vsubq_s16( d3, d2 ); \
|
||||
\
|
||||
d0 = vaddq_s16( d0, d2 ); \
|
||||
d1 = vaddq_s16( d1, d3 ); \
|
||||
\
|
||||
d2 = vaddq_s16( d2, d2 ); \
|
||||
d3 = vaddq_s16( d3, d3 ); \
|
||||
d2 = vsubq_s16( d2, d0 ); \
|
||||
d3 = vsubq_s16( d3, d1 ); \
|
||||
}
|
||||
|
||||
#define ADDSUB( d0, d1 ) \
|
||||
{ \
|
||||
int16x8_t v16_a, v16_s; \
|
||||
v16_a = vaddq_s16( d0, d1 ); \
|
||||
v16_s = vsubq_s16( d1, d0 ); \
|
||||
d0 = v16_a; \
|
||||
d1 = v16_s; \
|
||||
}
|
||||
|
||||
|
||||
#define ABSADDL( sum, vector0, vector1 ) \
|
||||
{ \
|
||||
vector0 = vabsq_s16( vector0 ); \
|
||||
vector1 = vabsq_s16( vector1 ); \
|
||||
sum = vaddq_s32( sum, vaddl_s16( vget_low_s16( vector0 ), vget_high_s16( vector0 ) ) ); \
|
||||
sum = vaddq_s32( sum, vaddl_s16( vget_low_s16( vector1 ), vget_high_s16( vector1 ) ) ); \
|
||||
}
|
||||
|
||||
int32_t y262_satd_8x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_satd;
|
||||
int64_t i64_satd;
|
||||
int16x8_t v16_d0, v16_d1, v16_d2, v16_d3, v16_d4, v16_d5, v16_d6, v16_d7;
|
||||
int16x8_t v16_a0, v16_a1, v16_a2, v16_a3;
|
||||
int32x4_t v16_res;
|
||||
int32x2_t v8_hadd0;
|
||||
int64x1_t v8_hadd1;
|
||||
|
||||
v16_res = vmovq_n_s32( 0 );
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 0 ), vld1_u8( pui8_blk2 + i_stride2 * 0 ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 1 ), vld1_u8( pui8_blk2 + i_stride2 * 1 ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 2 ), vld1_u8( pui8_blk2 + i_stride2 * 2 ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 3 ), vld1_u8( pui8_blk2 + i_stride2 * 3 ) ) );
|
||||
v16_d4 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 4 ), vld1_u8( pui8_blk2 + i_stride2 * 4 ) ) );
|
||||
v16_d5 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 5 ), vld1_u8( pui8_blk2 + i_stride2 * 5 ) ) );
|
||||
v16_d6 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 6 ), vld1_u8( pui8_blk2 + i_stride2 * 6 ) ) );
|
||||
v16_d7 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + i_stride1 * 7 ), vld1_u8( pui8_blk2 + i_stride2 * 7 ) ) );
|
||||
|
||||
HADAMARD_NEON_4x2( v16_d0, v16_d1, v16_d2, v16_d3 );
|
||||
HADAMARD_NEON_4x2( v16_d4, v16_d5, v16_d6, v16_d7 );
|
||||
|
||||
ADDSUB( v16_d0, v16_d4 );
|
||||
ADDSUB( v16_d1, v16_d5 );
|
||||
ADDSUB( v16_d2, v16_d6 );
|
||||
ADDSUB( v16_d3, v16_d7 );
|
||||
|
||||
v16_a0 = vcombine_s16( vget_low_s16( v16_d0 ), vget_low_s16( v16_d4 ) );
|
||||
v16_a1 = vcombine_s16( vget_high_s16( v16_d0 ), vget_high_s16( v16_d4 ) );
|
||||
ADDSUB( v16_a0, v16_a1 );
|
||||
ABSADDL( v16_res, v16_a0, v16_a1 );
|
||||
|
||||
v16_a0 = vcombine_s16( vget_low_s16( v16_d1 ), vget_low_s16( v16_d5 ) );
|
||||
v16_a1 = vcombine_s16( vget_high_s16( v16_d1 ), vget_high_s16( v16_d5 ) );
|
||||
ADDSUB( v16_a0, v16_a1 );
|
||||
ABSADDL( v16_res, v16_a0, v16_a1 );
|
||||
|
||||
v16_a0 = vcombine_s16( vget_low_s16( v16_d2 ), vget_low_s16( v16_d6 ) );
|
||||
v16_a1 = vcombine_s16( vget_high_s16( v16_d2 ), vget_high_s16( v16_d6 ) );
|
||||
ADDSUB( v16_a0, v16_a1 );
|
||||
ABSADDL( v16_res, v16_a0, v16_a1 );
|
||||
|
||||
v16_a0 = vcombine_s16( vget_low_s16( v16_d3 ), vget_low_s16( v16_d7 ) );
|
||||
v16_a1 = vcombine_s16( vget_high_s16( v16_d3 ), vget_high_s16( v16_d7 ) );
|
||||
ADDSUB( v16_a0, v16_a1 );
|
||||
ABSADDL( v16_res, v16_a0, v16_a1 );
|
||||
|
||||
v8_hadd0 = vadd_s32( vget_low_s32( v16_res ), vget_high_s32( v16_res ) );
|
||||
v8_hadd1 = vpaddl_s32( v8_hadd0 );
|
||||
|
||||
i64_satd = vget_lane_s64( v8_hadd1, 0 );
|
||||
i_satd = ( int32_t )i64_satd;
|
||||
|
||||
i_satd = ( i_satd + 2 ) >> 2;
|
||||
|
||||
return i_satd;
|
||||
}
|
||||
|
||||
|
||||
int32_t y262_satd_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_satd;
|
||||
|
||||
i_satd = y262_satd_8x8_neon( pui8_blk1, i_stride1, pui8_blk2, i_stride2 );
|
||||
i_satd += y262_satd_8x8_neon( pui8_blk1 + 8, i_stride1, pui8_blk2 + 8, i_stride2 );
|
||||
i_satd += y262_satd_8x8_neon( pui8_blk1 + ( 8 * i_stride1 ), i_stride1, pui8_blk2 + ( 8 * i_stride2 ), i_stride2 );
|
||||
i_satd += y262_satd_8x8_neon( pui8_blk1 + 8 + ( 8 * i_stride1 ), i_stride1, pui8_blk2 + 8 + ( 8 * i_stride2 ), i_stride2 );
|
||||
|
||||
return i_satd;
|
||||
}
|
||||
|
||||
|
||||
int32_t y262_satd_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 )
|
||||
{
|
||||
int32_t i_satd;
|
||||
|
||||
i_satd = y262_satd_8x8_neon( pui8_blk1, i_stride1, pui8_blk2, i_stride2 );
|
||||
i_satd += y262_satd_8x8_neon( pui8_blk1 + 8, i_stride1, pui8_blk2 + 8, i_stride2 );
|
||||
|
||||
return i_satd;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int32_t y262_ssd_8x8_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride )
|
||||
{
|
||||
int32_t i_ssd;
|
||||
int16x8_t v16_d0, v16_d1, v16_d2, v16_d3;
|
||||
int32x4_t v16_ssd0, v16_ssd1;
|
||||
int32x2_t v8_hadd0;
|
||||
int64x1_t v8_hadd1;
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 0 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 0 * i_blk2_stride ) ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 1 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 1 * i_blk2_stride ) ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 2 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 2 * i_blk2_stride ) ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 3 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 3 * i_blk2_stride ) ) ) );
|
||||
|
||||
v16_ssd0 = vmull_s16( vget_low_s16( v16_d0 ), vget_low_s16( v16_d0 ) );
|
||||
v16_ssd1 = vmull_s16( vget_high_s16( v16_d0 ), vget_high_s16( v16_d0 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d1 ), vget_low_s16( v16_d1 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d1 ), vget_high_s16( v16_d1 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d2 ), vget_low_s16( v16_d2 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d2 ), vget_high_s16( v16_d2 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d3 ), vget_low_s16( v16_d3 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d3 ), vget_high_s16( v16_d3 ) );
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 4 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 4 * i_blk2_stride ) ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 5 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 5 * i_blk2_stride ) ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 6 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 6 * i_blk2_stride ) ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_blk1 + ( 7 * i_blk1_stride ) ), vld1_u8( pui8_blk2 + ( 7 * i_blk2_stride ) ) ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d0 ), vget_low_s16( v16_d0 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d0 ), vget_high_s16( v16_d0 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d1 ), vget_low_s16( v16_d1 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d1 ), vget_high_s16( v16_d1 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d2 ), vget_low_s16( v16_d2 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d2 ), vget_high_s16( v16_d2 ) );
|
||||
|
||||
v16_ssd0 = vmlal_s16( v16_ssd0, vget_low_s16( v16_d3 ), vget_low_s16( v16_d3 ) );
|
||||
v16_ssd1 = vmlal_s16( v16_ssd1, vget_high_s16( v16_d3 ), vget_high_s16( v16_d3 ) );
|
||||
|
||||
v16_ssd0 = vaddq_s32( v16_ssd0, v16_ssd1 );
|
||||
|
||||
v8_hadd0 = vadd_s32( vget_low_s32( v16_ssd0 ), vget_high_s32( v16_ssd0 ) );
|
||||
v8_hadd0 = vpadd_s32( v8_hadd0, v8_hadd0 );
|
||||
|
||||
i_ssd = vget_lane_s32( v8_hadd0, 0 );
|
||||
|
||||
return i_ssd;
|
||||
}
|
||||
|
||||
int32_t y262_ssd_16x16_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride )
|
||||
{
|
||||
int32_t i_ssd;
|
||||
|
||||
i_ssd = y262_ssd_8x8_neon( pui8_blk1, i_blk1_stride, pui8_blk2, i_blk2_stride );
|
||||
i_ssd += y262_ssd_8x8_neon( pui8_blk1 + 8, i_blk1_stride, pui8_blk2 + 8, i_blk2_stride );
|
||||
i_ssd += y262_ssd_8x8_neon( pui8_blk1 + ( 8 * i_blk1_stride ), i_blk1_stride, pui8_blk2 + ( 8 * i_blk2_stride ), i_blk2_stride );
|
||||
i_ssd += y262_ssd_8x8_neon( pui8_blk1 + 8 + ( 8 * i_blk1_stride ), i_blk1_stride, pui8_blk2 + 8 + ( 8 * i_blk2_stride), i_blk2_stride );
|
||||
return i_ssd;
|
||||
}
|
||||
|
||||
|
||||
void y262_sub_8x8_neon( int16_t *pi16_diff, uint8_t *pui8_src1, int32_t i_stride_src1, uint8_t *pui8_src2, int32_t i_stride_src2 )
|
||||
{
|
||||
int16x8_t v16_d0, v16_d1, v16_d2, v16_d3;
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 0 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 0 * i_stride_src2 ) ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 1 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 1 * i_stride_src2 ) ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 2 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 2 * i_stride_src2 ) ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 3 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 3 * i_stride_src2 ) ) ) );
|
||||
|
||||
vst1q_s16( pi16_diff + 0, v16_d0 );
|
||||
vst1q_s16( pi16_diff + 8, v16_d1 );
|
||||
vst1q_s16( pi16_diff + 16, v16_d2 );
|
||||
vst1q_s16( pi16_diff + 24, v16_d3 );
|
||||
|
||||
v16_d0 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 4 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 4 * i_stride_src2 ) ) ) );
|
||||
v16_d1 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 5 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 5 * i_stride_src2 ) ) ) );
|
||||
v16_d2 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 6 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 6 * i_stride_src2 ) ) ) );
|
||||
v16_d3 = vreinterpretq_s16_u16( vsubl_u8( vld1_u8( pui8_src1 + ( 7 * i_stride_src1 ) ), vld1_u8( pui8_src2 + ( 7 * i_stride_src2 ) ) ) );
|
||||
|
||||
vst1q_s16( pi16_diff + 32, v16_d0 );
|
||||
vst1q_s16( pi16_diff + 40, v16_d1 );
|
||||
vst1q_s16( pi16_diff + 48, v16_d2 );
|
||||
vst1q_s16( pi16_diff + 56, v16_d3 );
|
||||
}
|
||||
|
||||
void y262_add_8x8_neon( uint8_t *pui8_destination, int32_t i_destination_stride, uint8_t *pui8_base, int32_t i_base_stride, int16_t *pi16_difference )
|
||||
{
|
||||
int32_t i_y;
|
||||
|
||||
int16x8_t v16_zero = vmovq_n_s16( 0 );
|
||||
|
||||
for( i_y = 0; i_y < 8; i_y += 2 )
|
||||
{
|
||||
int16x8_t v16_d0, v16_b0, v16_d1, v16_b1;
|
||||
|
||||
v16_d0 = vld1q_s16( pi16_difference + ( i_y * 8 ) );
|
||||
v16_d1 = vld1q_s16( pi16_difference + ( ( i_y + 1 ) * 8 ) );
|
||||
v16_b0 = vreinterpretq_s16_u16( vshll_n_u8( vld1_u8( pui8_base + ( i_y * i_base_stride ) ), 0 ) );
|
||||
v16_b1 = vreinterpretq_s16_u16( vshll_n_u8( vld1_u8( pui8_base + ( ( i_y + 1 ) * i_base_stride ) ), 0 ) );
|
||||
|
||||
v16_d0 = vaddq_s16( v16_d0, v16_b0 );
|
||||
v16_d1 = vaddq_s16( v16_d1, v16_b1 );
|
||||
|
||||
v16_d0 = vmaxq_s16( v16_zero, v16_d0 );
|
||||
v16_d1 = vmaxq_s16( v16_zero, v16_d1 );
|
||||
|
||||
vst1_u8( pui8_destination + ( i_y * i_destination_stride ), vqmovn_u16( vreinterpretq_u16_s16( v16_d0 ) ) );
|
||||
vst1_u8( pui8_destination + ( ( i_y + 1 ) * i_destination_stride ), vqmovn_u16( vreinterpretq_u16_s16( v16_d1 ) ) );
|
||||
}
|
||||
}
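Editor's note: y262_sub_8x8_neon stores the 16-bit residual block src1 - src2, and y262_add_8x8_neon adds a residual back onto the 8-bit prediction with clamping to 0..255 (the vmaxq_s16 against zero handles the low clamp, the saturating vqmovn_u16 narrow the high clamp). A scalar sketch of the add for reference (illustration only; the _c_ref name is hypothetical):

#include <stdint.h>

void y262_add_8x8_c_ref( uint8_t *pui8_destination, int32_t i_destination_stride, uint8_t *pui8_base, int32_t i_base_stride, int16_t *pi16_difference )
{
    int32_t i_x, i_y;

    for( i_y = 0; i_y < 8; i_y++ )
    {
        for( i_x = 0; i_x < 8; i_x++ )
        {
            int32_t i_val = pui8_base[ i_x ] + pi16_difference[ ( i_y * 8 ) + i_x ];
            pui8_destination[ i_x ] = ( uint8_t )( i_val < 0 ? 0 : ( i_val > 255 ? 255 : i_val ) );
        }
        pui8_base += i_base_stride;
        pui8_destination += i_destination_stride;
    }
}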
|
||||
|
||||
|
||||
/* MC */
|
||||
|
||||
#define MC_FUNC_NEON( name, i_width, i_height, hpelidx ) \
|
||||
void y262_motcomp_##name##_put_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ) \
|
||||
{ \
|
||||
int32_t i_x, i_y; \
|
||||
\
|
||||
if( hpelidx == 0 ) \
|
||||
{ \
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a; \
|
||||
v8_a = vld1_u8( pui8_src ); \
|
||||
vst1_u8( pui8_dst, v8_a ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a; \
|
||||
v16_a = vld1q_u8( pui8_src ); \
|
||||
vst1q_u8( pui8_dst, v16_a ); \
|
||||
} \
|
||||
pui8_src += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else if( hpelidx == 1 ) \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2; \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + 1; \
|
||||
\
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_b; \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_b ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_b; \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_b ) ); \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else if( hpelidx == 2 ) \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2; \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + i_src_stride; \
|
||||
\
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_b; \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_b ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_b; \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_b ) ); \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2, *pui8_src3, *pui8_src4; \
|
||||
uint8x8_t v8_a, v8_b, v8_c, v8_d; \
|
||||
uint8x16_t v16_a, v16_b, v16_c, v16_d; \
|
||||
uint8x8_t v8_one = vmov_n_u8( 1 ); \
|
||||
uint8x16_t v16_one = vmovq_n_u8( 1 ); \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + 1; \
|
||||
pui8_src3 = pui8_src + i_src_stride; \
|
||||
pui8_src4 = pui8_src + i_src_stride + 1; \
|
||||
\
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
} \
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_carry0, v8_carry1; \
|
||||
v8_c = vld1_u8( pui8_src3 ); \
|
||||
v8_d = vld1_u8( pui8_src4 ); \
|
||||
\
|
||||
v8_carry0 = veor_u8( v8_a, v8_c); \
|
||||
v8_carry1 = veor_u8( v8_b, v8_d); \
|
||||
\
|
||||
v8_a = vrhadd_u8( v8_a, v8_c ); \
|
||||
v8_b = vrhadd_u8( v8_b, v8_d ); \
|
||||
v8_carry0 = vorr_u8( v8_carry0, v8_carry1 ); \
|
||||
\
|
||||
v8_carry1 = veor_u8( v8_a, v8_b); \
|
||||
v8_carry0 = vand_u8( v8_carry0, v8_carry1 ); \
|
||||
v8_carry0 = vand_u8( v8_carry0, v8_one ); \
|
||||
\
|
||||
v8_a = vrhadd_u8( v8_a, v8_b ); \
|
||||
v8_a = vsub_u8( v8_a, v8_carry0 ); \
|
||||
\
|
||||
vst1_u8( pui8_dst, v8_a ); \
|
||||
\
|
||||
v8_a = v8_c; \
|
||||
v8_b = v8_d; \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_carry0, v16_carry1; \
|
||||
v16_c = vld1q_u8( pui8_src3 ); \
|
||||
v16_d = vld1q_u8( pui8_src4 ); \
|
||||
\
|
||||
v16_carry0 = veorq_u8( v16_a, v16_c); \
|
||||
v16_carry1 = veorq_u8( v16_b, v16_d); \
|
||||
\
|
||||
v16_a = vrhaddq_u8( v16_a, v16_c ); \
|
||||
v16_b = vrhaddq_u8( v16_b, v16_d ); \
|
||||
v16_carry0 = vorrq_u8( v16_carry0, v16_carry1 ); \
|
||||
\
|
||||
v16_carry1 = veorq_u8( v16_a, v16_b); \
|
||||
v16_carry0 = vandq_u8( v16_carry0, v16_carry1 ); \
|
||||
v16_carry0 = vandq_u8( v16_carry0, v16_one ); \
|
||||
\
|
||||
v16_a = vrhaddq_u8( v16_a, v16_b ); \
|
||||
v16_a = vsubq_u8( v16_a, v16_carry0 ); \
|
||||
\
|
||||
vst1q_u8( pui8_dst, v16_a ); \
|
||||
\
|
||||
v16_a = v16_c; \
|
||||
v16_b = v16_d; \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_src3 += i_src_stride; \
|
||||
pui8_src4 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
void y262_motcomp_##name##_avg_neon( uint8_t *pui8_src, int32_t i_src_stride, uint8_t *pui8_dst, int32_t i_dst_stride ) \
|
||||
{ \
|
||||
int32_t i_x, i_y; \
|
||||
\
|
||||
if( hpelidx == 0 ) \
|
||||
{ \
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_z; \
|
||||
v8_a = vld1_u8( pui8_src ); \
|
||||
v8_z = vld1_u8( pui8_dst ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_z; \
|
||||
v16_a = vld1q_u8( pui8_src ); \
|
||||
v16_z = vld1q_u8( pui8_dst ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
|
||||
} \
|
||||
pui8_src += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else if( hpelidx == 1 ) \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2; \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + 1; \
|
||||
\
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_b, v8_z; \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
v8_z = vld1_u8( pui8_dst ); \
|
||||
v8_a = vrhadd_u8( v8_a, v8_b ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_b, v16_z; \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
v16_z = vld1q_u8( pui8_dst ); \
|
||||
v16_a = vrhaddq_u8( v16_a, v16_b ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else if( hpelidx == 2 ) \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2; \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + i_src_stride; \
|
||||
\
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_a, v8_b, v8_z; \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
v8_z = vld1_u8( pui8_dst ); \
|
||||
v8_a = vrhadd_u8( v8_a, v8_b ); \
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_a, v16_b, v16_z; \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
v16_z = vld1q_u8( pui8_dst ); \
|
||||
v16_a = vrhaddq_u8( v16_a, v16_b ); \
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
uint8_t *pui8_src1, *pui8_src2, *pui8_src3, *pui8_src4; \
|
||||
uint8x8_t v8_a, v8_b, v8_c, v8_d, v8_z; \
|
||||
uint8x16_t v16_a, v16_b, v16_c, v16_d, v16_z; \
|
||||
uint8x8_t v8_one = vmov_n_u8( 1 ); \
|
||||
uint8x16_t v16_one = vmovq_n_u8( 1 ); \
|
||||
\
|
||||
pui8_src1 = pui8_src; \
|
||||
pui8_src2 = pui8_src + 1; \
|
||||
pui8_src3 = pui8_src + i_src_stride; \
|
||||
pui8_src4 = pui8_src + i_src_stride + 1; \
|
||||
\
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
v8_a = vld1_u8( pui8_src1 ); \
|
||||
v8_b = vld1_u8( pui8_src2 ); \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
v16_a = vld1q_u8( pui8_src1 ); \
|
||||
v16_b = vld1q_u8( pui8_src2 ); \
|
||||
} \
|
||||
for( i_y = 0; i_y < i_height; i_y++ ) \
|
||||
{ \
|
||||
if( i_width == 8 ) \
|
||||
{ \
|
||||
uint8x8_t v8_carry0, v8_carry1; \
|
||||
v8_c = vld1_u8( pui8_src3 ); \
|
||||
v8_d = vld1_u8( pui8_src4 ); \
|
||||
v8_z = vld1_u8( pui8_dst ); \
|
||||
\
|
||||
v8_carry0 = veor_u8( v8_a, v8_c); \
|
||||
v8_carry1 = veor_u8( v8_b, v8_d); \
|
||||
\
|
||||
v8_a = vrhadd_u8( v8_a, v8_c ); \
|
||||
v8_b = vrhadd_u8( v8_b, v8_d ); \
|
||||
v8_carry0 = vorr_u8( v8_carry0, v8_carry1 ); \
|
||||
\
|
||||
v8_carry1 = veor_u8( v8_a, v8_b); \
|
||||
v8_carry0 = vand_u8( v8_carry0, v8_carry1 ); \
|
||||
v8_carry0 = vand_u8( v8_carry0, v8_one ); \
|
||||
\
|
||||
v8_a = vrhadd_u8( v8_a, v8_b ); \
|
||||
v8_a = vsub_u8( v8_a, v8_carry0 ); \
|
||||
\
|
||||
vst1_u8( pui8_dst, vrhadd_u8( v8_a, v8_z ) ); \
|
||||
\
|
||||
v8_a = v8_c; \
|
||||
v8_b = v8_d; \
|
||||
} \
|
||||
else if( i_width == 16 ) \
|
||||
{ \
|
||||
uint8x16_t v16_carry0, v16_carry1; \
|
||||
v16_c = vld1q_u8( pui8_src3 ); \
|
||||
v16_d = vld1q_u8( pui8_src4 ); \
|
||||
v16_z = vld1q_u8( pui8_dst ); \
|
||||
\
|
||||
v16_carry0 = veorq_u8( v16_a, v16_c); \
|
||||
v16_carry1 = veorq_u8( v16_b, v16_d); \
|
||||
\
|
||||
v16_a = vrhaddq_u8( v16_a, v16_c ); \
|
||||
v16_b = vrhaddq_u8( v16_b, v16_d ); \
|
||||
v16_carry0 = vorrq_u8( v16_carry0, v16_carry1 ); \
|
||||
\
|
||||
v16_carry1 = veorq_u8( v16_a, v16_b); \
|
||||
v16_carry0 = vandq_u8( v16_carry0, v16_carry1 ); \
|
||||
v16_carry0 = vandq_u8( v16_carry0, v16_one ); \
|
||||
\
|
||||
v16_a = vrhaddq_u8( v16_a, v16_b ); \
|
||||
v16_a = vsubq_u8( v16_a, v16_carry0 ); \
|
||||
\
|
||||
vst1q_u8( pui8_dst, vrhaddq_u8( v16_a, v16_z ) ); \
|
||||
\
|
||||
v16_a = v16_c; \
|
||||
v16_b = v16_d; \
|
||||
} \
|
||||
pui8_src1 += i_src_stride; \
|
||||
pui8_src2 += i_src_stride; \
|
||||
pui8_src3 += i_src_stride; \
|
||||
pui8_src4 += i_src_stride; \
|
||||
pui8_dst += i_dst_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
|
||||
|
||||
MC_FUNC_NEON( 16x16_00, 16, 16, 0 );
|
||||
MC_FUNC_NEON( 16x16_01, 16, 16, 1 );
|
||||
MC_FUNC_NEON( 16x16_10, 16, 16, 2 );
|
||||
MC_FUNC_NEON( 16x16_11, 16, 16, 3 );
|
||||
|
||||
MC_FUNC_NEON( 16x8_00, 16, 8, 0 );
|
||||
MC_FUNC_NEON( 16x8_01, 16, 8, 1 );
|
||||
MC_FUNC_NEON( 16x8_10, 16, 8, 2 );
|
||||
MC_FUNC_NEON( 16x8_11, 16, 8, 3 );
|
||||
|
||||
MC_FUNC_NEON( 8x16_00, 8, 16, 0 );
|
||||
MC_FUNC_NEON( 8x16_01, 8, 16, 1 );
|
||||
MC_FUNC_NEON( 8x16_10, 8, 16, 2 );
|
||||
MC_FUNC_NEON( 8x16_11, 8, 16, 3 );
|
||||
|
||||
MC_FUNC_NEON( 8x8_00, 8, 8, 0 );
|
||||
MC_FUNC_NEON( 8x8_01, 8, 8, 1 );
|
||||
MC_FUNC_NEON( 8x8_10, 8, 8, 2 );
|
||||
MC_FUNC_NEON( 8x8_11, 8, 8, 3 );
|
||||
|
||||
MC_FUNC_NEON( 8x4_00, 8, 4, 0 );
|
||||
MC_FUNC_NEON( 8x4_01, 8, 4, 1 );
|
||||
MC_FUNC_NEON( 8x4_10, 8, 4, 2 );
|
||||
MC_FUNC_NEON( 8x4_11, 8, 4, 3 );
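Editor's note on the instantiations above: the last macro argument selects the half-pel case — 0 copies samples, 1 averages horizontal neighbours, 2 averages vertical neighbours, and 3 averages all four neighbours, where the XOR/AND carry term is there to undo the bias of the two cascaded vrhadd roundings so the result matches the exact four-sample average. In scalar form the four cases are (sketch only; s, dst, x and stride are placeholder names):

dst[ x ] = s[ x ];                                                                     /* _00: full pel        */
dst[ x ] = ( s[ x ] + s[ x + 1 ] + 1 ) >> 1;                                           /* _01: horizontal half */
dst[ x ] = ( s[ x ] + s[ x + stride ] + 1 ) >> 1;                                      /* _10: vertical half   */
dst[ x ] = ( s[ x ] + s[ x + 1 ] + s[ x + stride ] + s[ x + stride + 1 ] + 2 ) >> 2;   /* _11: both            */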
42 src/y262/pixelop_arm64.h (new file)
@ -0,0 +1,42 @@
|
|||
/*
|
||||
Copyright (c) 2013, Ralf Willenbacher
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
int32_t y262_sad_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
|
||||
int32_t y262_sad_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
|
||||
|
||||
int32_t y262_satd_16x16_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
|
||||
int32_t y262_satd_16x8_neon( uint8_t *pui8_blk1, int32_t i_stride1, uint8_t *pui8_blk2, int32_t i_stride2 );
|
||||
|
||||
int32_t y262_ssd_8x8_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride );
|
||||
int32_t y262_ssd_16x16_neon( uint8_t *pui8_blk1, int32_t i_blk1_stride, uint8_t *pui8_blk2, int32_t i_blk2_stride );
|
||||
|
||||
void y262_sub_8x8_neon( int16_t *pi16_diff, uint8_t *pui8_src1, int32_t i_stride_src1, uint8_t *pui8_src2, int32_t i_stride_src2 );
|
||||
void y262_add_8x8_neon( uint8_t *pui8_destination, int32_t i_destination_stride, uint8_t *pui8_base, int32_t i_base_stride, int16_t *pi16_difference );
444 src/y262/transform_arm64.c (new file)
@ -0,0 +1,444 @@
|
|||
/*
|
||||
Copyright (c) 2013, Ralf Willenbacher
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
//#ifdef ASSEMBLY_ARM64
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "y262.h"
|
||||
|
||||
|
||||
#define RND1BITS ( 11 )
|
||||
#define RND2BITS ( 31 - RND1BITS )
|
||||
|
||||
|
||||
static const int16_t rgi16_y262_fdct_neon_cs1[ 8 ][ 8 ] = {
|
||||
{ 16383, 16383, 16383, 16383, 16383, 16383, 16383, 16383 },
|
||||
{ 22724, 19265, 12872, 4520, -4520, -12872, -19265, -22724 },
|
||||
{ 21406, 8867, -8867, -21406, -21406, -8867, 8867, 21406 },
|
||||
{ 19265, -4520, -22724, -12872, 12872, 22724, 4520, -19265 },
|
||||
{ 16383, -16383, -16383, 16383, 16383, -16383, -16383, 16383 },
|
||||
{ 12872, -22724, 4520, 19265, -19265, -4520, 22724, -12872 },
|
||||
{ 8867, -21406, 21406, -8867, -8867, 21406, -21406, 8867 },
|
||||
{ 4520, -12872, 19265, -22724, 22724, -19265, 12872, -4520 },
|
||||
};
|
||||
static const int16_t rgi16_y262_fdct_neon_cs2[ 32 ] = {
|
||||
16385, 16385, 22726, 19266, -8867, -21408, -22726, -12873,
|
||||
16385, 16385, 12873, 4521, 21408, 8867, 19266, -4521,
|
||||
16385, -16385, 12873, -22726, 21408, -8867, 19266, -22726,
|
||||
-16385, 16385, 4521, 19266, 8867, -21408, 4521, -12873,
|
||||
};
|
||||
|
||||
|
||||
void y262_fdct_neon( int16_t *pi16_block, int16_t *pi16_dst )
|
||||
{
|
||||
int i_i, i_j, i_k;
|
||||
int16x8_t rgv16_tmp[ 8 ], rgv16_dsts[ 8 ];
|
||||
int16x8_t rgv16_e[ 4 ], rgv16_ee[ 2 ];
|
||||
int32x4x2_t rgv32_transA[ 4 ];
|
||||
int16x8x2_t rgv32_transB[ 4 ];
|
||||
int16x4_t v8_mt0, v8_mt1, v8_mt2, v8_mt3, v8_mt4, v8_mt5, v8_mt6, v8_mt7;
|
||||
|
||||
#define RND( x, y ) ( ( ( x ) + ( ( y ) ? ( 1 << ( y - 1 ) ) : 0 ) ) >> ( y ) )
|
||||
#define MUL( x, m ) ( ( x ) * ( m ) )
|
||||
|
||||
for( i_j = 1; i_j < 8; i_j += 2 )
|
||||
{
|
||||
int16x8_t v16_d;
|
||||
int32x4_t v16_s0, v16_s1;
|
||||
|
||||
v16_d = vsubq_s16( vld1q_s16( &pi16_block[ 8 * 0 ] ), vld1q_s16( &pi16_block[ 8 * 7 ] ) );
|
||||
v16_s0 = vmull_n_s16( vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
|
||||
v16_s1 = vmull_n_s16( vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
|
||||
|
||||
for( i_k = 1; i_k < 4; i_k++ )
|
||||
{
|
||||
v16_d = vsubq_s16( vld1q_s16( &pi16_block[ 8 * i_k ] ), vld1q_s16( &pi16_block[ 8 * ( 7 - i_k ) ] ) );
|
||||
v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ i_k ] );
|
||||
v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ i_k ] );
|
||||
}
|
||||
|
||||
rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) );
|
||||
}
|
||||
|
||||
for ( i_k = 0; i_k < 4; i_k++ )
|
    {
        rgv16_e[ i_k ] = vaddq_s16( vld1q_s16( &pi16_block[ 8 * i_k ] ), vld1q_s16( &pi16_block[ 8 * ( 7 - i_k ) ] ) );
    }

    for( i_j = 2; i_j < 8; i_j += 4 )
    {
        int16x8_t v16_d;
        int32x4_t v16_s0, v16_s1;

        v16_d = vsubq_s16( rgv16_e[ 0 ], rgv16_e[ 3 ] );
        v16_s0 = vmull_n_s16( vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
        v16_s1 = vmull_n_s16( vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );

        v16_d = vsubq_s16( rgv16_e[ 1 ], rgv16_e[ 2 ] );
        v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
        v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( v16_d ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );

        rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) );
    }

    for( i_k = 0; i_k < 2; i_k++ )
    {
        rgv16_ee[ i_k ] = vaddq_s16( rgv16_e[ i_k ], rgv16_e[ 3 - i_k ] );
    }

    for( i_j = 0; i_j < 8; i_j += 4 )
    {
        int16x8_t v16_d;
        int32x4_t v16_s0, v16_s1;

        v16_s0 = vmull_n_s16( vget_low_s16( rgv16_ee[ 0 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );
        v16_s1 = vmull_n_s16( vget_high_s16( rgv16_ee[ 0 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 0 ] );

        v16_s0 = vmlal_n_s16( v16_s0, vget_low_s16( rgv16_ee[ 1 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );
        v16_s1 = vmlal_n_s16( v16_s1, vget_high_s16( rgv16_ee[ 1 ] ), rgi16_y262_fdct_neon_cs1[ i_j ][ 1 ] );

        rgv16_tmp[ i_j ] = vcombine_s16( vqrshrn_n_s32( v16_s0, RND1BITS ), vqrshrn_n_s32( v16_s1, RND1BITS ) );
    }

    v8_mt0 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 0 ] );
    v8_mt1 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 4 ] );
    v8_mt2 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 8 ] );
    v8_mt3 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 12 ] );
    v8_mt4 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 16 ] );
    v8_mt5 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 20 ] );
    v8_mt6 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 24 ] );
    v8_mt7 = vld1_s16( &rgi16_y262_fdct_neon_cs2[ 28 ] );

    for( i_j = 0; i_j < 8; i_j++ )
    {
        int16x4_t v8_l0, v8_l1, v8_o, v8_e, v8_m0, v8_m1, v8_m2, v8_m3;
        int16x4x2_t v16_trn0;
        int32x2x2_t v16_trn1;
        int32x4_t v16_s0, v16_s1, v16_s2, v16_s3;
        int16x8_t v16_row;

        v8_l0 = vget_low_s16( rgv16_tmp[ i_j ] );
        v8_l1 = vget_high_s16( rgv16_tmp[ i_j ] );
        v8_l1 = vrev64_s16( v8_l1 );
        v8_o = vsub_s16( v8_l0, v8_l1 );
        v8_e = vadd_s16( v8_l0, v8_l1 );

        v16_trn1 = vzip_s32( vreinterpret_s32_s16( v8_e ), vreinterpret_s32_s16( v8_o ) );
        v8_m0 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] );
        v8_m1 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] );

        v16_s0 = vmull_s16( v8_m0, v8_mt0 );
        v16_s1 = vmull_s16( v8_m1, v8_mt1 );
        v16_s0 = vmlal_s16( v16_s0, v8_m1, v8_mt2 );
        v16_s1 = vmlal_s16( v16_s1, v8_m0, v8_mt3 );

        v16_s2 = vmull_s16( v8_m0, v8_mt4 );
        v16_s3 = vmull_s16( v8_m1, v8_mt5 );
        v16_s2 = vmlal_s16( v16_s2, v8_m1, v8_mt6 );
        v16_s3 = vmlal_s16( v16_s3, v8_m0, v8_mt7 );

        v16_s0 = vpaddq_s32( v16_s0, v16_s1 );
        v16_s1 = vpaddq_s32( v16_s2, v16_s3 );

        v16_row = vcombine_s16( vmovn_s32( vrshrq_n_s32( v16_s0, RND2BITS ) ), vmovn_s32( vrshrq_n_s32( v16_s1, RND2BITS ) ) );
        vst1q_s16( pi16_dst + ( 8 * i_j ), v16_row );
    }
}

#define RND1BITS ( 11 )
#define RND2BITS ( 31 - RND1BITS )

static const int16_t rgi16_y262_idct_cs1[ 8 ][ 8 ] = {
    { 16383, 16383, 16383, 16383, 16383, 16383, 16383, 16383 },
    { 22724, 19265, 12872, 4520, -4520, -12872, -19265, -22724 },
    { 21406, 8867, -8867, -21406, -21406, -8867, 8867, 21406 },
    { 19265, -4520, -22724, -12872, 12872, 22724, 4520, -19265 },
    { 16383, -16383, -16383, 16383, 16383, -16383, -16383, 16383 },
    { 12872, -22724, 4520, 19265, -19265, -4520, 22724, -12872 },
    { 8867, -21406, 21406, -8867, -8867, 21406, -21406, 8867 },
    { 4520, -12872, 19265, -22724, 22724, -19265, 12872, -4520 },
};

static const int16_t rgi16_y262_idct_cs2[ 8 ][ 8 ] = {
    { 16385, 16385, 16385, 16385, 16385, 16385, 16385, 16385 },
    { 22726, 19266, 12873, 4521, -4521, -12873, -19266, -22726 },
    { 21408, 8867, -8867, -21408, -21408, -8867, 8867, 21408 },
    { 19266, -4521, -22726, -12873, 12873, 22726, 4521, -19266 },
    { 16385, -16385, -16385, 16385, 16385, -16385, -16385, 16385 },
    { 12873, -22726, 4521, 19266, -19266, -4521, 22726, -12873 },
    { 8867, -21408, 21408, -8867, -8867, 21408, -21408, 8867 },
    { 4521, -12873, 19266, -22726, 22726, -19266, 12873, -4521 },
};

static const int16_t rgi16_y262_idct_neon_cs2[ 32 ] = {
    16385, 21408, 16385, 8867, 16385, -8867, 16385, -21408,
    16385, 8867, -16385, -21408, -16385, 21408, 16385, -8867,
    22726, 19266, 19266, -4521, 12873, -22726, 4521, -12873,
    12873, 4521, -22726, -12873, 4521, 19266, 19266, -22726
};

void y262_idct_neon( int16_t *pi16_block, int16_t *pi16_dst )
{
    int i_j, i_k;
    int16_t rgi16_tmp[ 64 ];
    int32_t rgi_e[ 4 ], rgi_o[ 4 ];
    int32_t rgi_ee[ 2 ], rgi_eo[ 2 ];
    int32x4_t rgv16_o[ 4 ];
    int32x4_t rgv16_eo[ 4 ];
    int32x4_t rgv16_ee[ 4 ];
    int32x4_t rgv16_e[ 4 ];
    int16x4_t rgv8_tmp[ 8 ][ 2 ];
    int16x4_t v8_mt0, v8_mt1, v8_mt2, v8_mt3, v8_mt4, v8_mt5, v8_mt6, v8_mt7;

#define RND( x, y ) ( ( ( x ) + ( ( y ) ? ( 1 << ( y - 1 ) ) : 0 ) ) >> ( y ) )
#define MUL( x, m ) ( ( x ) * ( m ) )

    for( i_j = 0; i_j < 2; i_j++ )
    {
        int16x4_t v8_b0, v8_b1, v8_b2, v8_b3;

        v8_b0 = vld1_s16( pi16_block + ( 8 * 1 ) + ( i_j * 4 ) );
        v8_b1 = vld1_s16( pi16_block + ( 8 * 3 ) + ( i_j * 4 ) );
        v8_b2 = vld1_s16( pi16_block + ( 8 * 5 ) + ( i_j * 4 ) );
        v8_b3 = vld1_s16( pi16_block + ( 8 * 7 ) + ( i_j * 4 ) );

        rgv16_o[ 0 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 0 ] );
        rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 0 ] );
        rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 0 ] );
        rgv16_o[ 0 ] = vmlal_n_s16( rgv16_o[ 0 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 0 ] );

        rgv16_o[ 1 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 1 ] );
        rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 1 ] );
        rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 1 ] );
        rgv16_o[ 1 ] = vmlal_n_s16( rgv16_o[ 1 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 1 ] );

        rgv16_o[ 2 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 2 ] );
        rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 2 ] );
        rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 2 ] );
        rgv16_o[ 2 ] = vmlal_n_s16( rgv16_o[ 2 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 2 ] );

        rgv16_o[ 3 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 1 ][ 3 ] );
        rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b1, rgi16_y262_idct_cs1[ 3 ][ 3 ] );
        rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b2, rgi16_y262_idct_cs1[ 5 ][ 3 ] );
        rgv16_o[ 3 ] = vmlal_n_s16( rgv16_o[ 3 ], v8_b3, rgi16_y262_idct_cs1[ 7 ][ 3 ] );

        v8_b0 = vld1_s16( pi16_block + ( 8 * 2 ) + ( i_j * 4 ) );
        v8_b1 = vld1_s16( pi16_block + ( 8 * 6 ) + ( i_j * 4 ) );
        v8_b2 = vld1_s16( pi16_block + ( 8 * 0 ) + ( i_j * 4 ) );
        v8_b3 = vld1_s16( pi16_block + ( 8 * 4 ) + ( i_j * 4 ) );

        rgv16_eo[ 0 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 2 ][ 0 ] );
        rgv16_eo[ 0 ] = vmlal_n_s16( rgv16_eo[ 0 ], v8_b1, rgi16_y262_idct_cs1[ 6 ][ 0 ] );
        rgv16_eo[ 1 ] = vmull_n_s16( v8_b0, rgi16_y262_idct_cs1[ 2 ][ 1 ] );
        rgv16_eo[ 1 ] = vmlal_n_s16( rgv16_eo[ 1 ], v8_b1, rgi16_y262_idct_cs1[ 6 ][ 1 ] );
        rgv16_ee[ 0 ] = vmull_n_s16( v8_b2, rgi16_y262_idct_cs1[ 0 ][ 0 ] );
        rgv16_ee[ 0 ] = vmlal_n_s16( rgv16_ee[ 0 ], v8_b3, rgi16_y262_idct_cs1[ 4 ][ 0 ] );
        rgv16_ee[ 1 ] = vmull_n_s16( v8_b2, rgi16_y262_idct_cs1[ 0 ][ 1 ] );
        rgv16_ee[ 1 ] = vmlal_n_s16( rgv16_ee[ 1 ], v8_b3, rgi16_y262_idct_cs1[ 4 ][ 1 ] );

        rgv16_e[ 0 ] = vaddq_s32( rgv16_ee[ 0 ], rgv16_eo[ 0 ] );
        rgv16_e[ 1 ] = vaddq_s32( rgv16_ee[ 1 ], rgv16_eo[ 1 ] );
        rgv16_e[ 2 ] = vsubq_s32( rgv16_ee[ 1 ], rgv16_eo[ 1 ] );
        rgv16_e[ 3 ] = vsubq_s32( rgv16_ee[ 0 ], rgv16_eo[ 0 ] );

        for( i_k = 0; i_k < 4; i_k++ )
        {
            int32x4_t v16_eoa, v16_eos;
            v16_eoa = vaddq_s32( rgv16_e[ i_k ], rgv16_o[ i_k ] );
            rgv8_tmp[ i_k ][ i_j ] = vqrshrn_n_s32( v16_eoa, RND1BITS );
            v16_eos = vsubq_s32( rgv16_e[ 3 - i_k ], rgv16_o[ 3 - i_k ] );
            rgv8_tmp[ i_k + 4 ][ i_j ] = vqrshrn_n_s32( v16_eos, RND1BITS );
        }
    }

    v8_mt0 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 0 ] );
    v8_mt1 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 4 ] );
    v8_mt2 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 8 ] );
    v8_mt3 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 12 ] );
    v8_mt4 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 16 ] );
    v8_mt5 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 20 ] );
    v8_mt6 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 24 ] );
    v8_mt7 = vld1_s16( &rgi16_y262_idct_neon_cs2[ 28 ] );

    for( i_j = 0; i_j < 8; i_j++ )
    {
        int16x4_t v8_l02, v8_l46, v8_l13, v8_l57, v8_m0, v8_m1, v8_m2, v8_m3, v8_m4, v8_m5, v8_m6, v8_m7;
        int16x4x2_t v16_trn0;
        int32x2x2_t v16_trn1;
        int32x4_t v16_s0, v16_s1, v16_s2, v16_s3, v16_s4, v16_s5, v16_s6, v16_s7, v16_e, v16_o;
        int16x8_t v16_row;

        v8_m0 = rgv8_tmp[ i_j ][ 0 ];
        v8_m1 = rgv8_tmp[ i_j ][ 1 ];

        v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v8_m0 ), vreinterpret_s32_s16( v8_m1 ) );
        v16_trn0 = vzip_s16( vreinterpret_s16_s32( v16_trn1.val[ 0 ] ), vreinterpret_s16_s32( v16_trn1.val[ 1 ] ) );

        v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v16_trn0.val[ 0 ] ), vreinterpret_s32_s16( v16_trn0.val[ 0 ] ) );
        v8_l02 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] );
        v8_l13 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] );

        v16_trn1 = vtrn_s32( vreinterpret_s32_s16( v16_trn0.val[ 1 ] ), vreinterpret_s32_s16( v16_trn0.val[ 1 ] ) );
        v8_l46 = vreinterpret_s16_s32( v16_trn1.val[ 0 ] );
        v8_l57 = vreinterpret_s16_s32( v16_trn1.val[ 1 ] );

        v16_s0 = vmull_s16( v8_l02, v8_mt0 );
        v16_s0 = vmlal_s16( v16_s0, v8_l46, v8_mt2 );
        v16_s1 = vmull_s16( v8_l02, v8_mt1 );
        v16_s1 = vmlal_s16( v16_s1, v8_l46, v8_mt3 );
        v16_s2 = vmull_s16( v8_l13, v8_mt4 );
        v16_s2 = vmlal_s16( v16_s2, v8_l57, v8_mt6 );
        v16_s3 = vmull_s16( v8_l13, v8_mt5 );
        v16_s3 = vmlal_s16( v16_s3, v8_l57, v8_mt7 );

        v16_s0 = vpaddq_s32( v16_s0, v16_s1 );
        v16_s1 = vpaddq_s32( v16_s2, v16_s3 );

        v16_e = vaddq_s32( v16_s0, v16_s1 );
        v16_o = vsubq_s32( v16_s0, v16_s1 );
        v16_o = vcombine_s32( vrev64_s32( vget_high_s32( v16_o ) ), vrev64_s32( vget_low_s32( v16_o ) ) );

        v16_row = vcombine_s16( vmovn_s32( vrshrq_n_s32( v16_e, RND2BITS ) ), vmovn_s32( vrshrq_n_s32( v16_o, RND2BITS ) ) );
        vst1q_s16( pi16_dst + ( 8 * i_j ), v16_row );
    }
}

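/*
   Illustrative round-trip sketch for the two transforms above, not code that ships
   with this commit: it feeds an arbitrary test pattern through y262_fdct_neon and
   back through y262_idct_neon and reports the largest per-sample difference. Because
   both passes use fixed-point cosine constants and rounding shifts, the output is
   only expected to come back close to the input, not bit-exact; the test pattern and
   the "small difference" expectation are assumptions, not guarantees from the encoder.
*/
static int32_t transform_neon_roundtrip_sketch( void )
{
    int16_t rgi16_in[ 64 ], rgi16_coeffs[ 64 ], rgi16_out[ 64 ];
    int32_t i_idx, i_diff, i_max_diff;

    for( i_idx = 0; i_idx < 64; i_idx++ )
    {
        rgi16_in[ i_idx ] = ( int16_t ) ( ( ( i_idx * 7 ) % 255 ) - 128 ); /* arbitrary pattern */
    }

    y262_fdct_neon( rgi16_in, rgi16_coeffs );
    y262_idct_neon( rgi16_coeffs, rgi16_out );

    i_max_diff = 0;
    for( i_idx = 0; i_idx < 64; i_idx++ )
    {
        i_diff = rgi16_out[ i_idx ] - rgi16_in[ i_idx ];
        i_diff = i_diff < 0 ? -i_diff : i_diff;
        i_max_diff = i_diff > i_max_diff ? i_diff : i_max_diff;
    }
    return i_max_diff; /* expected to stay within a few LSB if the scaling assumption holds */
}
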
int32_t y262_quant8x8_intra_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat, uint16_t *pui16_bias )
{
    int32_t i_y, i_x, i_qm, i_nz, i_intra_dc;
    int16x8_t v16_zero, v16_nz, v16_2047, v16_m2048;

    v16_nz = v16_zero = vmovq_n_s16( 0 );
    v16_2047 = vmovq_n_s16( 2047 );
    v16_m2048 = vmovq_n_s16( -2048 );

    i_intra_dc = pi_coeffs[ 0 ];
    pi_coeffs[ 0 ] = 0;

    i_nz = 0;
    for( i_y = 0; i_y < 8; i_y++ )
    {
        int16x8_t v16_co;
        int16x8_t v16_mask;
        uint16x8_t v16_qm, v16_bias, v16_cou;
        uint32x4_t v16_res0, v16_res1;

        v16_co = vld1q_s16( pi_coeffs + ( i_y * 8 ) );
        v16_qm = vld1q_u16( pui16_qmat + ( i_y * 8 ) );
        v16_bias = vld1q_u16( pui16_bias + ( i_y * 8 ) );
        v16_mask = vreinterpretq_s16_u16( vcgtq_s16( v16_zero, v16_co ) );
        v16_co = veorq_s16( v16_co, v16_mask );
        v16_co = vsubq_s16( v16_co, v16_mask );

        v16_cou = vaddq_u16( vreinterpretq_u16_s16( v16_co ), v16_bias );
        v16_res0 = vmull_u16( vget_low_u16( v16_cou ), vget_low_u16( v16_qm ) );
        v16_res1 = vmull_u16( vget_high_u16( v16_cou ), vget_high_u16( v16_qm ) );

        v16_co = vreinterpretq_s16_u16( vcombine_u16( vshrn_n_u32( v16_res0, 16 ), vshrn_n_u32( v16_res1, 16 ) ) );

        v16_co = veorq_s16( v16_co, v16_mask );
        v16_co = vsubq_s16( v16_co, v16_mask );

        v16_co = vminq_s16( v16_co, v16_2047 );
        v16_co = vmaxq_s16( v16_co, v16_m2048 );

        vst1q_s16( pi_coeffs + ( i_y * 8 ), v16_co );

        v16_nz = vorrq_s16( v16_co, v16_nz );
    }

    v16_nz = vnegq_s16( vreinterpretq_s16_u16( vceqzq_s16( v16_nz ) ) );
    i_nz = vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 0 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 1 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 2 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 3 );

    pi_coeffs[ 0 ] = i_intra_dc;

    return i_nz;
}

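/*
   Illustrative call of the intra forward quantizer above, not code from this commit.
   Per coefficient the loop effectively computes sign( c ) * ( ( ( |c| + bias ) * qmat ) >> 16 ),
   clamps the result to [ -2048, 2047 ] and passes the DC coefficient through untouched;
   the return value is derived from a lane-wise reduction of the quantized levels.
   The matrix, bias and stride values below are placeholders, not the tables the encoder
   actually derives from the quantizer scale.
*/
static int32_t quant8x8_intra_fw_neon_usage_sketch( int16_t *pi16_block )
{
    uint16_t rgui16_qmat[ 64 ], rgui16_bias[ 64 ];
    int32_t i_idx;

    for( i_idx = 0; i_idx < 64; i_idx++ )
    {
        rgui16_qmat[ i_idx ] = 4096; /* placeholder reciprocal scale, exact format assumed */
        rgui16_bias[ i_idx ] = 0;    /* placeholder rounding bias */
    }
    /* i_stride assumed to be 8 for a packed 8x8 coefficient block */
    return y262_quant8x8_intra_fw_mpeg2_neon( pi16_block, 8, rgui16_qmat, rgui16_bias );
}
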
int32_t y262_quant8x8_inter_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat )
{
    int32_t i_y, i_x, i_qm, i_nz;
    int16x8_t v16_zero, v16_nz, v16_2047, v16_m2048;

    v16_nz = v16_zero = vmovq_n_s16( 0 );
    v16_2047 = vmovq_n_s16( 2047 );
    v16_m2048 = vmovq_n_s16( -2048 );

    i_nz = 0;
    for( i_y = 0; i_y < 8; i_y++ )
    {
        int16x8_t v16_co;
        int16x8_t v16_mask;
        uint16x8_t v16_qm;
        uint32x4_t v16_res0, v16_res1;

        v16_co = vld1q_s16( pi_coeffs + ( i_y * 8 ) );
        v16_qm = vld1q_u16( pui16_qmat + ( i_y * 8 ) );
        v16_mask = vreinterpretq_s16_u16( vcgtq_s16( v16_zero, v16_co ) );
        v16_co = veorq_s16( v16_co, v16_mask );
        v16_co = vsubq_s16( v16_co, v16_mask );

        v16_res0 = vmull_u16( vreinterpret_u16_s16( vget_low_s16( v16_co ) ), vget_low_u16( v16_qm ) );
        v16_res1 = vmull_u16( vreinterpret_u16_s16( vget_high_s16( v16_co ) ), vget_high_u16( v16_qm ) );

        v16_co = vreinterpretq_s16_u16( vcombine_u16( vshrn_n_u32( v16_res0, 16 ), vshrn_n_u32( v16_res1, 16 ) ) );

        v16_co = veorq_s16( v16_co, v16_mask );
        v16_co = vsubq_s16( v16_co, v16_mask );

        v16_co = vminq_s16( v16_co, v16_2047 );
        v16_co = vmaxq_s16( v16_co, v16_m2048 );

        vst1q_s16( pi_coeffs + ( i_y * 8 ), v16_co );

        v16_nz = vorrq_s16( v16_co, v16_nz );
    }

    v16_nz = vnegq_s16( vreinterpretq_s16_u16( vceqzq_s16( v16_nz ) ) );
    i_nz = vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 0 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 1 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 2 );
    i_nz |= vgetq_lane_u32( vreinterpretq_u32_s16( v16_nz ), 3 );

    return i_nz;
}

//#endif
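
/*
   Rough scalar picture of the inter quantizer loop above, written out for reference.
   It mirrors the per-coefficient arithmetic of the NEON lanes ( absolute value via the
   sign mask, multiply by the 16-bit matrix entry, shift right by 16, clamp, sign
   restore ), but it is a sketch, not the project's C fallback, and its return value
   simply flags whether any quantized level is nonzero.
*/
static int32_t quant8x8_inter_fw_scalar_sketch( int16_t *pi_coeffs, uint16_t *pui16_qmat )
{
    int32_t i_idx, i_sign, i_level, i_nz;

    i_nz = 0;
    for( i_idx = 0; i_idx < 64; i_idx++ )
    {
        i_sign = pi_coeffs[ i_idx ] < 0 ? -1 : 1;
        i_level = ( ( pi_coeffs[ i_idx ] * i_sign ) * ( int32_t ) pui16_qmat[ i_idx ] ) >> 16;
        i_level *= i_sign;
        if( i_level > 2047 )
        {
            i_level = 2047;
        }
        else if( i_level < -2048 )
        {
            i_level = -2048;
        }
        pi_coeffs[ i_idx ] = ( int16_t ) i_level;
        i_nz |= i_level;
    }
    return i_nz;
}
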
36  src/y262/transform_arm64.h  Normal file

@ -0,0 +1,36 @@
/*
Copyright (c) 2013, Ralf Willenbacher
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/

void y262_fdct_neon( int16_t *pi16_block, int16_t *pi16_dst );
void y262_idct_neon( int16_t *pi16_block, int16_t *pi16_dst );

int32_t y262_quant8x8_intra_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat, uint16_t *pui16_bias );
int32_t y262_quant8x8_inter_fw_mpeg2_neon( int16_t *pi_coeffs, int32_t i_stride, uint16_t *pui16_qmat );

@ -59,11 +59,19 @@ POSSIBILITY OF SUCH DAMAGE.
#include "transform.h"
#include "pixelop.h"
#include "me.h"
#include "transform_x86.h"
#include "pixelop_x86.h"
#include "ratectrl.h"
#include "threads.h"

#ifdef ASSEMBLY_X86
#include "transform_x86.h"
#include "pixelop_x86.h"
#endif

#ifdef ASSEMBLY_ARM64
#include "transform_arm64.h"
#include "pixelop_arm64.h"
#endif

void y262_init_motion_compensation( y262_t *ps_y262 );
void y262_error( y262_t *ps_y262, int32_t i_error_code, int8_t* pi8_format, ... );
void y262_encode_picture( y262_t *ps_y262, y262_picture_t *ps_picture, int32_t i_picture_type, int32_t i_pon );

@ -548,6 +548,7 @@ int32_t y262_initialize( void *p_y262, y262_configuration_t *ps_config )

    ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_c;
    ps_y262->s_funcs.f_idct_8x8 = y262_idct_c;
#ifdef ASSEMBLY_X86
    if( 1 )
    {
        ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x16 ] = y262_sad_16x16_sse2;

@ -565,7 +566,29 @@ int32_t y262_initialize( void *p_y262, y262_configuration_t *ps_config )
        ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_sse2;
        ps_y262->s_funcs.f_idct_8x8 = y262_idct_sse2;
    }
#endif

#ifdef ASSEMBLY_ARM64
    if( 1 )
    {
        ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x16 ] = y262_sad_16x16_neon;
        ps_y262->s_funcs.rgf_sad[ BLOCK_TYPE_16x8 ] = y262_sad_16x8_neon;
        ps_y262->s_funcs.rgf_satd[ BLOCK_TYPE_16x16 ] = y262_satd_16x16_neon;
        ps_y262->s_funcs.rgf_satd[ BLOCK_TYPE_16x8 ] = y262_satd_16x8_neon;

        ps_y262->s_funcs.f_ssd_16x16 = y262_ssd_16x16_neon;
        ps_y262->s_funcs.f_ssd_8x8 = y262_ssd_8x8_neon;
        ps_y262->s_funcs.f_add_8x8 = y262_add_8x8_neon;
        ps_y262->s_funcs.f_sub_8x8 = y262_sub_8x8_neon;
        ps_y262->s_funcs.f_quant8x8_intra_fw = y262_quant8x8_intra_fw_mpeg2_neon;
        ps_y262->s_funcs.f_quant8x8_inter_fw = y262_quant8x8_inter_fw_mpeg2_neon;

        ps_y262->s_funcs.f_fdct_8x8 = y262_fdct_neon;
        ps_y262->s_funcs.f_idct_8x8 = y262_idct_neon;
    }
#endif

    memset( ps_y262->rgi_y262_motion_bits_x, 0, sizeof( ps_y262->rgi_y262_motion_bits_x ) );
    memset( ps_y262->rgi_y262_motion_bits_y, 1, sizeof( ps_y262->rgi_y262_motion_bits_y ) );

@ -1,12 +1,6 @@
cmake_minimum_required(VERSION 3.1)
project(y262app)

if(CMAKE_SIZEOF_VOID_P EQUAL 8)
    set(ARCH "_x64")
else()
    set(ARCH "_x86")
endif()

set( SRC_FILES
    ${CMAKE_CURRENT_SOURCE_DIR}/main.c
)

@ -14,7 +8,7 @@ set( SRC_FILES
add_executable(y262app ${SRC_FILES})
target_link_libraries(y262app liby262)
set_target_properties(y262app PROPERTIES
    OUTPUT_NAME "y262$<$<CONFIG:Debug>:d>${ARCH}"
    OUTPUT_NAME "y262$<$<CONFIG:Debug>:d>"
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
    POSITION_INDEPENDENT_CODE ON
)