// SONIC ROBO BLAST 2
//-----------------------------------------------------------------------------
// Copyright (C) 1993-1996 by id Software, Inc.
// Copyright (C) 1998-2000 by DooM Legacy Team.
// Copyright (C) 1999-2023 by Sonic Team Junior.
//
// This program is free software distributed under the
// terms of the GNU General Public License, version 2.
// See the 'LICENSE' file for more details.
//-----------------------------------------------------------------------------
/// \file m_memcpy.c
/// \brief X86 optimized implementations of M_Memcpy

#include "doomdef.h"
#include "m_misc.h"

#if defined (__GNUC__) && defined (__i386__) // from libkwave, under GPL
// Alam: note libkwave memcpy code comes from mplayer's libvo/aclib_template.c, r699

/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(dest,src,n)\
{\
	register unsigned long int dummy;\
	__asm__ __volatile__(\
		"cld\n\t"\
		"rep; movsb"\
		:"=&D"(dest), "=&S"(src), "=&c"(dummy)\
		:"0" (dest), "1" (src), "2" (n)\
		: "memory", "cc");\
}

/* linux kernel __memcpy (from: /include/asm/string.h) */
ATTRINLINE static FUNCINLINE void *__memcpy (void *dest, const void * src, size_t n)
{
	int d0, d1, d2;
	if ( n < 4 )
	{
		small_memcpy(dest, src, n);
	}
	else
	{
		__asm__ __volatile__ (
			"rep ; movsl;"
			"testb $2,%b4;"
			"je 1f;"
			"movsw;"
			"1:\ttestb $1,%b4;"
			"je 2f;"
			"movsb;"
			"2:"
			: "=&c" (d0), "=&D" (d1), "=&S" (d2)
			: "0" (n/4), "q" (n), "1" ((long) dest), "2" ((long) src)
			: "memory");
	}
	return dest;
}

#define SSE_MMREG_SIZE 16
#define MMX_MMREG_SIZE 8

#define MMX1_MIN_LEN 0x800 /* 2K blocks */
#define MIN_LEN 0x40 /* 64-byte blocks */

/* SSE note: I tried to move 128 bytes at a time instead of 64, but it didn't
   make any measurable difference. I'm using 64 for the sake of simplicity. [MF] */
static /*FUNCTARGET("sse2")*/ void *sse_cpy(void * dest, const void * src, size_t n)
{
	void *retval = dest;
	size_t i;

	/* PREFETCH has effect even for MOVSB instruction ;) */
	__asm__ __volatile__ (
		"prefetchnta (%0);"
		"prefetchnta 32(%0);"
		"prefetchnta 64(%0);"
		"prefetchnta 96(%0);"
		"prefetchnta 128(%0);"
		"prefetchnta 160(%0);"
		"prefetchnta 192(%0);"
		"prefetchnta 224(%0);"
		"prefetchnta 256(%0);"
		"prefetchnta 288(%0);"
		: : "r" (src));

	if (n >= MIN_LEN)
	{
		register unsigned long int delta;
		/* Align destination to an SSE_MMREG_SIZE boundary */
		delta = ((unsigned long int)dest)&(SSE_MMREG_SIZE-1);
		if (delta)
		{
			delta = SSE_MMREG_SIZE-delta;
			n -= delta;
			small_memcpy(dest, src, delta);
		}
		i = n >> 6; /* n/64 */
		n &= 63;
		if (((unsigned long)src) & 15) /* if SRC is misaligned */
			for (; i > 0; i--)
			{
				__asm__ __volatile__ (
					"prefetchnta 320(%0);"
					"prefetchnta 352(%0);"
					"movups (%0), %%xmm0;"
					"movups 16(%0), %%xmm1;"
					"movups 32(%0), %%xmm2;"
					"movups 48(%0), %%xmm3;"
					"movntps %%xmm0, (%1);"
					"movntps %%xmm1, 16(%1);"
					"movntps %%xmm2, 32(%1);"
					"movntps %%xmm3, 48(%1);"
					:: "r" (src), "r" (dest) : "memory");
				src = (const unsigned char *)src + 64;
				dest = (unsigned char *)dest + 64;
			}
		else
			/* Only if SRC is aligned on a 16-byte boundary: this allows using
			   movaps instead of movups, which requires the data to be aligned,
			   otherwise a general-protection exception (#GP) is generated. */
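			/* Note (editorial): in both the movups and movaps paths above and
			   below, the destination was already aligned to 16 bytes earlier in
			   this function, so the non-temporal movntps stores (which also
			   require 16-byte alignment) are always safe; only the load
			   instruction differs with the source alignment. */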
			for (; i > 0; i--)
			{
				__asm__ __volatile__ (
					"prefetchnta 320(%0);"
					"prefetchnta 352(%0);"
					"movaps (%0), %%xmm0;"
					"movaps 16(%0), %%xmm1;"
					"movaps 32(%0), %%xmm2;"
					"movaps 48(%0), %%xmm3;"
					"movntps %%xmm0, (%1);"
					"movntps %%xmm1, 16(%1);"
					"movntps %%xmm2, 32(%1);"
					"movntps %%xmm3, 48(%1);"
					:: "r" (src), "r" (dest) : "memory");
				src = ((const unsigned char *)src) + 64;
				dest = ((unsigned char *)dest) + 64;
			}
		/* since movntps is weakly-ordered, an "sfence"
		 * is needed to make the stores ordered again */
		__asm__ __volatile__ ("sfence":::"memory");
		/* enables use of the FPU again */
		__asm__ __volatile__ ("emms":::"memory");
	}
	/*
	 * Now do the tail of the block
	 */
	if (n) __memcpy(dest, src, n);
	return retval;
}

static FUNCTARGET("mmx") void *mmx2_cpy(void *dest, const void *src, size_t n)
{
	void *retval = dest;
	size_t i;

	/* PREFETCH has effect even for MOVSB instruction ;) */
	__asm__ __volatile__ (
		"prefetchnta (%0);"
		"prefetchnta 32(%0);"
		"prefetchnta 64(%0);"
		"prefetchnta 96(%0);"
		"prefetchnta 128(%0);"
		"prefetchnta 160(%0);"
		"prefetchnta 192(%0);"
		"prefetchnta 224(%0);"
		"prefetchnta 256(%0);"
		"prefetchnta 288(%0);"
		: : "r" (src));

	if (n >= MIN_LEN)
	{
		register unsigned long int delta;
		/* Align destination to an MMX_MMREG_SIZE boundary */
		delta = ((unsigned long int)dest)&(MMX_MMREG_SIZE-1);
		if (delta)
		{
			delta = MMX_MMREG_SIZE-delta;
			n -= delta;
			small_memcpy(dest, src, delta);
		}
		i = n >> 6; /* n/64 */
		n &= 63;
		for (; i > 0; i--)
		{
			__asm__ __volatile__ (
				"prefetchnta 320(%0);"
				"prefetchnta 352(%0);"
				"movq (%0), %%mm0;"
				"movq 8(%0), %%mm1;"
				"movq 16(%0), %%mm2;"
				"movq 24(%0), %%mm3;"
				"movq 32(%0), %%mm4;"
				"movq 40(%0), %%mm5;"
				"movq 48(%0), %%mm6;"
				"movq 56(%0), %%mm7;"
				"movntq %%mm0, (%1);"
				"movntq %%mm1, 8(%1);"
				"movntq %%mm2, 16(%1);"
				"movntq %%mm3, 24(%1);"
				"movntq %%mm4, 32(%1);"
				"movntq %%mm5, 40(%1);"
				"movntq %%mm6, 48(%1);"
				"movntq %%mm7, 56(%1);"
				:: "r" (src), "r" (dest) : "memory");
			src = ((const unsigned char *)src) + 64;
			dest = ((unsigned char *)dest) + 64;
		}
		/* since movntq is weakly-ordered, an "sfence"
		 * is needed to make the stores ordered again */
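		/* emms below additionally clears the MMX register state so ordinary
		   x87 FPU code can run again afterwards, mirroring the sfence/emms
		   pair at the end of sse_cpy. */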
		__asm__ __volatile__ ("sfence":::"memory");
		__asm__ __volatile__ ("emms":::"memory");
	}
	/*
	 * Now do the tail of the block
	 */
	if (n) __memcpy(dest, src, n);
	return retval;
}

static FUNCTARGET("mmx") void *mmx1_cpy(void *dest, const void *src, size_t n) //3DNOW
{
	void *retval = dest;
	size_t i;

	/* PREFETCH has effect even for MOVSB instruction ;) */
	__asm__ __volatile__ (
		"prefetch (%0);"
		"prefetch 32(%0);"
		"prefetch 64(%0);"
		"prefetch 96(%0);"
		"prefetch 128(%0);"
		"prefetch 160(%0);"
		"prefetch 192(%0);"
		"prefetch 224(%0);"
		"prefetch 256(%0);"
		"prefetch 288(%0);"
		: : "r" (src));

	if (n >= MMX1_MIN_LEN)
	{
		register unsigned long int delta;
		/* Align destination to an MMX_MMREG_SIZE boundary */
		delta = ((unsigned long int)dest)&(MMX_MMREG_SIZE-1);
		if (delta)
		{
			delta = MMX_MMREG_SIZE-delta;
			n -= delta;
			small_memcpy(dest, src, delta);
		}
		i = n >> 6; /* n/64 */
		n &= 63;
		for (; i > 0; i--)
		{
			__asm__ __volatile__ (
				"prefetch 320(%0);"
				"prefetch 352(%0);"
				"movq (%0), %%mm0;"
				"movq 8(%0), %%mm1;"
				"movq 16(%0), %%mm2;"
				"movq 24(%0), %%mm3;"
				"movq 32(%0), %%mm4;"
				"movq 40(%0), %%mm5;"
				"movq 48(%0), %%mm6;"
				"movq 56(%0), %%mm7;"
				"movq %%mm0, (%1);"
				"movq %%mm1, 8(%1);"
				"movq %%mm2, 16(%1);"
				"movq %%mm3, 24(%1);"
				"movq %%mm4, 32(%1);"
				"movq %%mm5, 40(%1);"
				"movq %%mm6, 48(%1);"
				"movq %%mm7, 56(%1);"
				:: "r" (src), "r" (dest) : "memory");
			src = ((const unsigned char *)src) + 64;
			dest = ((unsigned char *)dest) + 64;
		}
		__asm__ __volatile__ ("femms":::"memory"); // same as mmx_cpy() but with a femms
	}
	/*
	 * Now do the tail of the block
	 */
	if (n) __memcpy(dest, src, n);
	return retval;
}
#endif

// Alam: why? memcpy may be __cdecl/_System and our code may not be the same type
static void *cpu_cpy(void *dest, const void *src, size_t n)
{
	if (src == NULL)
	{
		CONS_Debug(DBG_MEMORY, "Memcpy from 0x0?!: %p %p %s\n", dest, src, sizeu1(n));
		return dest;
	}

	if (dest == NULL)
	{
		CONS_Debug(DBG_MEMORY, "Memcpy to 0x0?!: %p %p %s\n", dest, src, sizeu1(n));
		return dest;
	}

	return memcpy(dest, src, n);
}

static /*FUNCTARGET("mmx")*/ void *mmx_cpy(void *dest, const void *src, size_t n)
{
#if defined (_MSC_VER) && defined (_X86_)
	_asm
	{
		mov ecx, [n]
		mov esi, [src]
		mov edi, [dest]
		shr ecx, 6 // with MMX: 64 bytes per iteration
		jz lower_64 // if fewer than 64 bytes
loop_64: // MMX transfers multiples of 64 bytes
		movq mm0,  0[ESI] // read sources
		movq mm1,  8[ESI]
		movq mm2, 16[ESI]
		movq mm3, 24[ESI]
		movq mm4, 32[ESI]
		movq mm5, 40[ESI]
		movq mm6, 48[ESI]
		movq mm7, 56[ESI]

		movq  0[EDI], mm0 // write destination
		movq  8[EDI], mm1
		movq 16[EDI], mm2
		movq 24[EDI], mm3
		movq 32[EDI], mm4
		movq 40[EDI], mm5
		movq 48[EDI], mm6
		movq 56[EDI], mm7

		add esi, 64
		add edi, 64
		dec ecx
		jnz loop_64
		emms // close mmx operation
lower_64: // transfer rest of buffer
		mov ebx, esi
		sub ebx, src
		mov ecx, [n]
		sub ecx, ebx
		shr ecx, 3 // multiples of 8 bytes
		jz lower_8
loop_8:
		movq mm0, [esi] // read source
		movq [edi], mm0 // write destination
		add esi, 8
		add edi, 8
		dec ecx
		jnz loop_8
		emms // close mmx operation
lower_8:
		mov ebx, esi
		sub ebx, src
		mov ecx, [n]
		sub ecx, ebx
		rep movsb
		mov eax, [dest] // return dest
	}
#elif defined (__GNUC__) && defined (__i386__)
	void *retval = dest;
	size_t i;

	if (n >= MMX1_MIN_LEN)
	{
		register unsigned long int delta;
		/* Align destination to an MMX_MMREG_SIZE boundary */
		delta = ((unsigned long int)dest)&(MMX_MMREG_SIZE-1);
		if (delta)
		{
			delta = MMX_MMREG_SIZE-delta;
			n -= delta;
			small_memcpy(dest, src, delta);
		}
		i = n >> 6; /* n/64 */
		n &= 63;
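		/* Note: unlike mmx2_cpy, this fallback uses plain movq stores; baseline
		   MMX has no non-temporal movntq, so no trailing sfence is needed
		   either, only emms to release the MMX/FPU state. */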
		for (; i > 0; i--)
		{
			__asm__ __volatile__ (
				"movq (%0), %%mm0;"
				"movq 8(%0), %%mm1;"
				"movq 16(%0), %%mm2;"
				"movq 24(%0), %%mm3;"
				"movq 32(%0), %%mm4;"
				"movq 40(%0), %%mm5;"
				"movq 48(%0), %%mm6;"
				"movq 56(%0), %%mm7;"
				"movq %%mm0, (%1);"
				"movq %%mm1, 8(%1);"
				"movq %%mm2, 16(%1);"
				"movq %%mm3, 24(%1);"
				"movq %%mm4, 32(%1);"
				"movq %%mm5, 40(%1);"
				"movq %%mm6, 48(%1);"
				"movq %%mm7, 56(%1);"
				:: "r" (src), "r" (dest) : "memory");
			src = ((const unsigned char *)src) + 64;
			dest = ((unsigned char *)dest) + 64;
		}
		__asm__ __volatile__ ("emms":::"memory");
	}
	/*
	 * Now do the tail of the block
	 */
	if (n) __memcpy(dest, src, n);
	return retval;
#else
	return cpu_cpy(dest, src, n);
#endif
}

void *(*M_Memcpy)(void* dest, const void* src, size_t n) = cpu_cpy;

/** Memcpy that uses MMX, 3DNow, MMXExt or even SSE
  * Do not use on overlapping memory, use memmove for that
  */
void M_SetupMemcpy(void)
{
#if defined (__GNUC__) && defined (__i386__)
	if (R_SSE2)
		M_Memcpy = sse_cpy;
	else if (R_MMXExt)
		M_Memcpy = mmx2_cpy;
	else if (R_3DNow)
		M_Memcpy = mmx1_cpy;
	else
#endif
	if (R_MMX)
		M_Memcpy = mmx_cpy;
#if 0
	M_Memcpy = cpu_cpy;
#endif
}
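
/* The sketch below is illustrative only (hence the #if 0 guard) and is not
 * part of this translation unit's real interface. It assumes the CPU feature
 * flags R_SSE2/R_MMXExt/R_3DNow/R_MMX have already been populated by SRB2's
 * startup code before M_SetupMemcpy() is called. */
#if 0
static void M_MemcpyExample(void)
{
	static UINT8 srcbuf[1024], dstbuf[1024];

	M_SetupMemcpy(); /* pick the fastest available copy routine */
	M_Memcpy(dstbuf, srcbuf, sizeof dstbuf); /* copy through the selected routine;
	                                            never on overlapping buffers */
}
#endif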