Convert m_misc.c to .cpp

2025-10-30 08:01:28 +00:00 · 2023-02-11 16:10:29 -06:00 · 2023-02-11 16:10:29 -06:00 · b9cf1316bd
commit b9cf1316bd
parent d855d96a10
3 changed files with 461 additions and 443 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -38,7 +38,8 @@ add_executable(SRB2SDL2 MACOSX_BUNDLE WIN32
 	m_cond.c
 	m_easing.c
 	m_fixed.c
-	m_misc.c
+	m_memcpy.c
+	m_misc.cpp
 	m_perfstats.c
 	m_random.c
 	m_queue.c
--- a/src/m_memcpy.c
+++ b/src/m_memcpy.c
@ -0,0 +1,440 @@
+// SONIC ROBO BLAST 2
+//-----------------------------------------------------------------------------
+// Copyright (C) 1993-1996 by id Software, Inc.
+// Copyright (C) 1998-2000 by DooM Legacy Team.
+// Copyright (C) 1999-2023 by Sonic Team Junior.
+//
+// This program is free software distributed under the
+// terms of the GNU General Public License, version 2.
+// See the 'LICENSE' file for more details.
+//-----------------------------------------------------------------------------
+/// \file  m_memcpy.c
+/// \brief X86 optimized implementations of M_Memcpy
+
+#include "doomdef.h"
+#include "m_misc.h"
+
+#if defined (__GNUC__) && defined (__i386__) // from libkwave, under GPL
+// Alam: note libkwave memcpy code comes from mplayer's libvo/aclib_template.c, r699
+
+/* for small memory blocks (<256 bytes) this version is faster */
+#define small_memcpy(dest,src,n)\
+{\
+register unsigned long int dummy;\
+__asm__ __volatile__(\
+	"cld\n\t"\
+	"rep; movsb"\
+	:"=&D"(dest), "=&S"(src), "=&c"(dummy)\
+	:"0" (dest), "1" (src),"2" (n)\
+	: "memory", "cc");\
+}
+
+/* linux kernel __memcpy (from: /include/asm/string.h) */
+ATTRINLINE static FUNCINLINE void *__memcpy (void *dest, const void * src, size_t n)
+{
+	int d0, d1, d2;
+
+	if ( n < 4 )
+	{
+		small_memcpy(dest, src, n);
+	}
+	else
+	{
+		__asm__ __volatile__ (
+			"rep ; movsl;"
+			"testb $2,%b4;"
+			"je 1f;"
+			"movsw;"
+			"1:\ttestb $1,%b4;"
+			"je 2f;"
+			"movsb;"
+			"2:"
+		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
+		:"0" (n/4), "q" (n),"1" ((long) dest),"2" ((long) src)
+		: "memory");
+	}
+
+	return dest;
+}
+
+#define SSE_MMREG_SIZE 16
+#define MMX_MMREG_SIZE 8
+
+#define MMX1_MIN_LEN 0x800  /* 2K blocks */
+#define MIN_LEN 0x40  /* 64-byte blocks */
+
+/* SSE note: i tried to move 128 bytes a time instead of 64 but it
+didn't make any measureable difference. i'm using 64 for the sake of
+simplicity. [MF] */
+static /*FUNCTARGET("sse2")*/ void *sse_cpy(void * dest, const void * src, size_t n)
+{
+	void *retval = dest;
+	size_t i;
+
+	/* PREFETCH has effect even for MOVSB instruction ;) */
+	__asm__ __volatile__ (
+		"prefetchnta (%0);"
+		"prefetchnta 32(%0);"
+		"prefetchnta 64(%0);"
+		"prefetchnta 96(%0);"
+		"prefetchnta 128(%0);"
+		"prefetchnta 160(%0);"
+		"prefetchnta 192(%0);"
+		"prefetchnta 224(%0);"
+		"prefetchnta 256(%0);"
+		"prefetchnta 288(%0);"
+		: : "r" (src) );
+
+	if (n >= MIN_LEN)
+	{
+		register unsigned long int delta;
+		/* Align destinition to MMREG_SIZE -boundary */
+		delta = ((unsigned long int)dest)&(SSE_MMREG_SIZE-1);
+		if (delta)
+		{
+			delta=SSE_MMREG_SIZE-delta;
+			n -= delta;
+			small_memcpy(dest, src, delta);
+		}
+		i = n >> 6; /* n/64 */
+		n&=63;
+		if (((unsigned long)src) & 15)
+		/* if SRC is misaligned */
+		 for (; i>0; i--)
+		 {
+			__asm__ __volatile__ (
+				"prefetchnta 320(%0);"
+				"prefetchnta 352(%0);"
+				"movups (%0), %%xmm0;"
+				"movups 16(%0), %%xmm1;"
+				"movups 32(%0), %%xmm2;"
+				"movups 48(%0), %%xmm3;"
+				"movntps %%xmm0, (%1);"
+				"movntps %%xmm1, 16(%1);"
+				"movntps %%xmm2, 32(%1);"
+				"movntps %%xmm3, 48(%1);"
+			:: "r" (src), "r" (dest) : "memory");
+			src = (const unsigned char *)src + 64;
+			dest = (unsigned char *)dest + 64;
+		}
+		else
+			/*
+			   Only if SRC is aligned on 16-byte boundary.
+			   It allows to use movaps instead of movups, which required data
+			   to be aligned or a general-protection exception (#GP) is generated.
+			*/
+		 for (; i>0; i--)
+		 {
+			__asm__ __volatile__ (
+				"prefetchnta 320(%0);"
+				"prefetchnta 352(%0);"
+				"movaps (%0), %%xmm0;"
+				"movaps 16(%0), %%xmm1;"
+				"movaps 32(%0), %%xmm2;"
+				"movaps 48(%0), %%xmm3;"
+				"movntps %%xmm0, (%1);"
+				"movntps %%xmm1, 16(%1);"
+				"movntps %%xmm2, 32(%1);"
+				"movntps %%xmm3, 48(%1);"
+			:: "r" (src), "r" (dest) : "memory");
+			src = ((const unsigned char *)src) + 64;
+			dest = ((unsigned char *)dest) + 64;
+		}
+		/* since movntq is weakly-ordered, a "sfence"
+		 * is needed to become ordered again. */
+		__asm__ __volatile__ ("sfence":::"memory");
+		/* enables to use FPU */
+		__asm__ __volatile__ ("emms":::"memory");
+	}
+	/*
+	 *	Now do the tail of the block
+	 */
+	if (n) __memcpy(dest, src, n);
+	return retval;
+}
+
+static FUNCTARGET("mmx") void *mmx2_cpy(void *dest, const void *src, size_t n)
+{
+	void *retval = dest;
+	size_t i;
+
+	/* PREFETCH has effect even for MOVSB instruction ;) */
+	__asm__ __volatile__ (
+		"prefetchnta (%0);"
+		"prefetchnta 32(%0);"
+		"prefetchnta 64(%0);"
+		"prefetchnta 96(%0);"
+		"prefetchnta 128(%0);"
+		"prefetchnta 160(%0);"
+		"prefetchnta 192(%0);"
+		"prefetchnta 224(%0);"
+		"prefetchnta 256(%0);"
+		"prefetchnta 288(%0);"
+	: : "r" (src));
+
+	if (n >= MIN_LEN)
+	{
+		register unsigned long int delta;
+		/* Align destinition to MMREG_SIZE -boundary */
+		delta = ((unsigned long int)dest)&(MMX_MMREG_SIZE-1);
+		if (delta)
+		{
+			delta=MMX_MMREG_SIZE-delta;
+			n -= delta;
+			small_memcpy(dest, src, delta);
+		}
+		i = n >> 6; /* n/64 */
+		n&=63;
+		for (; i>0; i--)
+		{
+			__asm__ __volatile__ (
+				"prefetchnta 320(%0);"
+				"prefetchnta 352(%0);"
+				"movq (%0), %%mm0;"
+				"movq 8(%0), %%mm1;"
+				"movq 16(%0), %%mm2;"
+				"movq 24(%0), %%mm3;"
+				"movq 32(%0), %%mm4;"
+				"movq 40(%0), %%mm5;"
+				"movq 48(%0), %%mm6;"
+				"movq 56(%0), %%mm7;"
+				"movntq %%mm0, (%1);"
+				"movntq %%mm1, 8(%1);"
+				"movntq %%mm2, 16(%1);"
+				"movntq %%mm3, 24(%1);"
+				"movntq %%mm4, 32(%1);"
+				"movntq %%mm5, 40(%1);"
+				"movntq %%mm6, 48(%1);"
+				"movntq %%mm7, 56(%1);"
+			:: "r" (src), "r" (dest) : "memory");
+			src = ((const unsigned char *)src) + 64;
+			dest = ((unsigned char *)dest) + 64;
+		}
+		/* since movntq is weakly-ordered, a "sfence"
+		* is needed to become ordered again. */
+		__asm__ __volatile__ ("sfence":::"memory");
+		__asm__ __volatile__ ("emms":::"memory");
+	}
+	/*
+	 *	Now do the tail of the block
+	 */
+	if (n) __memcpy(dest, src, n);
+	return retval;
+}
+
+static FUNCTARGET("mmx") void *mmx1_cpy(void *dest, const void *src, size_t n) //3DNOW
+{
+	void *retval = dest;
+	size_t i;
+
+	/* PREFETCH has effect even for MOVSB instruction ;) */
+	__asm__ __volatile__ (
+		"prefetch (%0);"
+		"prefetch 32(%0);"
+		"prefetch 64(%0);"
+		"prefetch 96(%0);"
+		"prefetch 128(%0);"
+		"prefetch 160(%0);"
+		"prefetch 192(%0);"
+		"prefetch 224(%0);"
+		"prefetch 256(%0);"
+		"prefetch 288(%0);"
+	: : "r" (src));
+
+	if (n >= MMX1_MIN_LEN)
+	{
+		register unsigned long int delta;
+		/* Align destinition to MMREG_SIZE -boundary */
+		delta = ((unsigned long int)dest)&(MMX_MMREG_SIZE-1);
+		if (delta)
+		{
+			delta=MMX_MMREG_SIZE-delta;
+			n -= delta;
+			small_memcpy(dest, src, delta);
+		}
+		i = n >> 6; /* n/64 */
+		n&=63;
+		for (; i>0; i--)
+		{
+			__asm__ __volatile__ (
+				"prefetch 320(%0);"
+				"prefetch 352(%0);"
+				"movq (%0), %%mm0;"
+				"movq 8(%0), %%mm1;"
+				"movq 16(%0), %%mm2;"
+				"movq 24(%0), %%mm3;"
+				"movq 32(%0), %%mm4;"
+				"movq 40(%0), %%mm5;"
+				"movq 48(%0), %%mm6;"
+				"movq 56(%0), %%mm7;"
+				"movq %%mm0, (%1);"
+				"movq %%mm1, 8(%1);"
+				"movq %%mm2, 16(%1);"
+				"movq %%mm3, 24(%1);"
+				"movq %%mm4, 32(%1);"
+				"movq %%mm5, 40(%1);"
+				"movq %%mm6, 48(%1);"
+				"movq %%mm7, 56(%1);"
+			:: "r" (src), "r" (dest) : "memory");
+			src = ((const unsigned char *)src) + 64;
+			dest = ((unsigned char *)dest) + 64;
+		}
+		__asm__ __volatile__ ("femms":::"memory"); // same as mmx_cpy() but with a femms
+	}
+	/*
+	 *	Now do the tail of the block
+	 */
+	if (n) __memcpy(dest, src, n);
+	return retval;
+}
+#endif
+
+// Alam: why? memcpy may be __cdecl/_System and our code may be not the same type
+static void *cpu_cpy(void *dest, const void *src, size_t n)
+{
+	if (src == NULL)
+	{
+		CONS_Debug(DBG_MEMORY, "Memcpy from 0x0?!: %p %p %s\n", dest, src, sizeu1(n));
+		return dest;
+	}
+
+	if(dest == NULL)
+	{
+		CONS_Debug(DBG_MEMORY, "Memcpy to 0x0?!: %p %p %s\n", dest, src, sizeu1(n));
+		return dest;
+	}
+
+	return memcpy(dest, src, n);
+}
+
+static /*FUNCTARGET("mmx")*/ void *mmx_cpy(void *dest, const void *src, size_t n)
+{
+#if defined (_MSC_VER) && defined (_X86_)
+	_asm
+	{
+		mov ecx, [n]
+		mov esi, [src]
+		mov edi, [dest]
+		shr ecx, 6 // mit mmx: 64bytes per iteration
+		jz lower_64 // if lower than 64 bytes
+		loop_64: // MMX transfers multiples of 64bytes
+		movq mm0,  0[ESI] // read sources
+		movq mm1,  8[ESI]
+		movq mm2, 16[ESI]
+		movq mm3, 24[ESI]
+		movq mm4, 32[ESI]
+		movq mm5, 40[ESI]
+		movq mm6, 48[ESI]
+		movq mm7, 56[ESI]
+
+		movq  0[EDI], mm0 // write destination
+		movq  8[EDI], mm1
+		movq 16[EDI], mm2
+		movq 24[EDI], mm3
+		movq 32[EDI], mm4
+		movq 40[EDI], mm5
+		movq 48[EDI], mm6
+		movq 56[EDI], mm7
+
+		add esi, 64
+		add edi, 64
+		dec ecx
+		jnz loop_64
+		emms // close mmx operation
+		lower_64:// transfer rest of buffer
+		mov ebx,esi
+		sub ebx,src
+		mov ecx,[n]
+		sub ecx,ebx
+		shr ecx, 3 // multiples of 8 bytes
+		jz lower_8
+		loop_8:
+		movq  mm0, [esi] // read source
+		movq [edi], mm0 // write destination
+		add esi, 8
+		add edi, 8
+		dec ecx
+		jnz loop_8
+		emms // close mmx operation
+		lower_8:
+		mov ebx,esi
+		sub ebx,src
+		mov ecx,[n]
+		sub ecx,ebx
+		rep movsb
+		mov eax, [dest] // return dest
+	}
+#elif defined (__GNUC__) && defined (__i386__)
+	void *retval = dest;
+	size_t i;
+
+	if (n >= MMX1_MIN_LEN)
+	{
+		register unsigned long int delta;
+		/* Align destinition to MMREG_SIZE -boundary */
+		delta = ((unsigned long int)dest)&(MMX_MMREG_SIZE-1);
+		if (delta)
+		{
+			delta=MMX_MMREG_SIZE-delta;
+			n -= delta;
+			small_memcpy(dest, src, delta);
+		}
+		i = n >> 6; /* n/64 */
+		n&=63;
+		for (; i>0; i--)
+		{
+			__asm__ __volatile__ (
+				"movq (%0), %%mm0;"
+				"movq 8(%0), %%mm1;"
+				"movq 16(%0), %%mm2;"
+				"movq 24(%0), %%mm3;"
+				"movq 32(%0), %%mm4;"
+				"movq 40(%0), %%mm5;"
+				"movq 48(%0), %%mm6;"
+				"movq 56(%0), %%mm7;"
+				"movq %%mm0, (%1);"
+				"movq %%mm1, 8(%1);"
+				"movq %%mm2, 16(%1);"
+				"movq %%mm3, 24(%1);"
+				"movq %%mm4, 32(%1);"
+				"movq %%mm5, 40(%1);"
+				"movq %%mm6, 48(%1);"
+				"movq %%mm7, 56(%1);"
+			:: "r" (src), "r" (dest) : "memory");
+			src = ((const unsigned char *)src) + 64;
+			dest = ((unsigned char *)dest) + 64;
+		}
+		__asm__ __volatile__ ("emms":::"memory");
+	}
+	/*
+	 *	Now do the tail of the block
+	 */
+	if (n) __memcpy(dest, src, n);
+	return retval;
+#else
+	return cpu_cpy(dest, src, n);
+#endif
+}
+
+void *(*M_Memcpy)(void* dest, const void* src, size_t n) = cpu_cpy;
+
+/** Memcpy that uses MMX, 3DNow, MMXExt or even SSE
+  * Do not use on overlapped memory, use memmove for that
+  */
+void M_SetupMemcpy(void)
+{
+#if defined (__GNUC__) && defined (__i386__)
+	if (R_SSE2)
+		M_Memcpy = sse_cpy;
+	else if (R_MMXExt)
+		M_Memcpy = mmx2_cpy;
+	else if (R_3DNow)
+		M_Memcpy = mmx1_cpy;
+	else
+#endif
+	if (R_MMX)
+		M_Memcpy = mmx_cpy;
+#if 0
+	M_Memcpy = cpu_cpy;
+#endif
+}
--- a/src/m_misc.cpp
+++ b/src/m_misc.cpp
@ -8,7 +8,7 @@
 // terms of the GNU General Public License, version 2.
 // See the 'LICENSE' file for more details.
 //-----------------------------------------------------------------------------
-/// \file  m_misc.h
+/// \file  m_misc.cpp
 /// \brief Commonly used routines
 ///        Default config file, PCX screenshots, file i/o

@ -23,6 +23,7 @@
 #include <unistd.h>
 #endif

+#include <algorithm>
 #include <errno.h>

 // Extended map support.
@ -308,7 +309,7 @@ size_t FIL_ReadFileTag(char const *name, UINT8 **buffer, INT32 tag)
 	length = ftell(handle);
 	fseek(handle,0,SEEK_SET);

-	buf = Z_Malloc(length + 1, tag, NULL);
+	buf = static_cast<UINT8*>(Z_Malloc(length + 1, tag, NULL));
 	count = fread(buf, 1, length, handle);
 	fclose(handle);

@ -774,8 +775,8 @@ static void M_CreateScreenShotPalette(void)
 	for (i = 0, j = 0; i < 768; i += 3, j++)
 	{
 		RGBA_t locpal = ((cv_screenshot_colorprofile.value)
-		? pLocalPalette[(max(st_palette,0)*256)+j]
-		: pMasterPalette[(max(st_palette,0)*256)+j]);
+		? pLocalPalette[(std::max(st_palette,0)*256)+j]
+		: pMasterPalette[(std::max(st_palette,0)*256)+j]);
 		screenshot_palette[i] = locpal.s.red;
 		screenshot_palette[i+1] = locpal.s.green;
 		screenshot_palette[i+2] = locpal.s.blue;
@ -854,7 +855,7 @@ static void M_PNGhdr(png_structp png_ptr, png_infop png_info_ptr, PNG_CONST png_
 	const png_byte png_interlace = PNG_INTERLACE_NONE; //PNG_INTERLACE_ADAM7
 	if (palette)
 	{
-		png_colorp png_PLTE = png_malloc(png_ptr, sizeof(png_color)*256); //palette
+		png_colorp png_PLTE = static_cast<png_colorp>(png_malloc(png_ptr, sizeof(png_color)*256)); //palette
 		png_uint_16 i;

 		const png_byte *pal = palette;
@ -975,8 +976,8 @@ static void M_PNGText(png_structp png_ptr, png_infop png_info_ptr, PNG_CONST png

 static inline void M_PNGImage(png_structp png_ptr, png_infop png_info_ptr, PNG_CONST png_uint_32 height, png_bytep png_buf)
 {
-	png_uint_32 pitch = png_get_rowbytes(png_ptr, png_info_ptr);
-	png_bytepp row_pointers = png_malloc(png_ptr, height* sizeof (png_bytep));
+	png_uint_32 pitch = png_get_rowbytes(png_ptr, static_cast<const png_info*>(png_info_ptr));
+	png_bytepp row_pointers = static_cast<png_bytepp>(png_malloc(png_ptr, height* sizeof (png_bytep)));
 	png_uint_32 y;
 	for (y = 0; y < height; y++)
 	{
@ -1579,7 +1580,7 @@ boolean M_SavePNG(const char *filename, void *data, int width, int height, const

 	png_write_info(png_ptr, png_info_ptr);

-	M_PNGImage(png_ptr, png_info_ptr, height, data);
+	M_PNGImage(png_ptr, png_info_ptr, height, static_cast<png_bytep>(data));

 	png_write_end(png_ptr, png_info_ptr);
 	png_destroy_write_struct(&png_ptr, &png_info_ptr);
@ -2494,430 +2495,6 @@ char *sizeu5(size_t num)
 	return sizeu5_buf;
 }

-#if defined (__GNUC__) && defined (__i386__) // from libkwave, under GPL
-// Alam: note libkwave memcpy code comes from mplayer's libvo/aclib_template.c, r699
-
-/* for small memory blocks (<256 bytes) this version is faster */
-#define small_memcpy(dest,src,n)\
-{\
-register unsigned long int dummy;\
-__asm__ __volatile__(\
-	"cld\n\t"\
-	"rep; movsb"\
-	:"=&D"(dest), "=&S"(src), "=&c"(dummy)\
-	:"0" (dest), "1" (src),"2" (n)\
-	: "memory", "cc");\
-}
-/* linux kernel __memcpy (from: /include/asm/string.h) */
-ATTRINLINE static FUNCINLINE void *__memcpy (void *dest, const void * src, size_t n)
-{
-	int d0, d1, d2;
-
-	if ( n < 4 )
-	{
-		small_memcpy(dest, src, n);
-	}
-	else
-	{
-		__asm__ __volatile__ (
-			"rep ; movsl;"
-			"testb $2,%b4;"
-			"je 1f;"
-			"movsw;"
-			"1:\ttestb $1,%b4;"
-			"je 2f;"
-			"movsb;"
-			"2:"
-		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
-		:"0" (n/4), "q" (n),"1" ((long) dest),"2" ((long) src)
-		: "memory");
-	}
-
-	return dest;
-}
-
-#define SSE_MMREG_SIZE 16
-#define MMX_MMREG_SIZE 8
-
-#define MMX1_MIN_LEN 0x800  /* 2K blocks */
-#define MIN_LEN 0x40  /* 64-byte blocks */
-
-/* SSE note: i tried to move 128 bytes a time instead of 64 but it
-didn't make any measureable difference. i'm using 64 for the sake of
-simplicity. [MF] */
-static /*FUNCTARGET("sse2")*/ void *sse_cpy(void * dest, const void * src, size_t n)
-{
-	void *retval = dest;
-	size_t i;
-
-	/* PREFETCH has effect even for MOVSB instruction ;) */
-	__asm__ __volatile__ (
-		"prefetchnta (%0);"
-		"prefetchnta 32(%0);"
-		"prefetchnta 64(%0);"
-		"prefetchnta 96(%0);"
-		"prefetchnta 128(%0);"
-		"prefetchnta 160(%0);"
-		"prefetchnta 192(%0);"
-		"prefetchnta 224(%0);"
-		"prefetchnta 256(%0);"
-		"prefetchnta 288(%0);"
-		: : "r" (src) );
-
-	if (n >= MIN_LEN)
-	{
-		register unsigned long int delta;
-		/* Align destinition to MMREG_SIZE -boundary */
-		delta = ((unsigned long int)dest)&(SSE_MMREG_SIZE-1);
-		if (delta)
-		{
-			delta=SSE_MMREG_SIZE-delta;
-			n -= delta;
-			small_memcpy(dest, src, delta);
-		}
-		i = n >> 6; /* n/64 */
-		n&=63;
-		if (((unsigned long)src) & 15)
-		/* if SRC is misaligned */
-		 for (; i>0; i--)
-		 {
-			__asm__ __volatile__ (
-				"prefetchnta 320(%0);"
-				"prefetchnta 352(%0);"
-				"movups (%0), %%xmm0;"
-				"movups 16(%0), %%xmm1;"
-				"movups 32(%0), %%xmm2;"
-				"movups 48(%0), %%xmm3;"
-				"movntps %%xmm0, (%1);"
-				"movntps %%xmm1, 16(%1);"
-				"movntps %%xmm2, 32(%1);"
-				"movntps %%xmm3, 48(%1);"
-			:: "r" (src), "r" (dest) : "memory");
-			src = (const unsigned char *)src + 64;
-			dest = (unsigned char *)dest + 64;
-		}
-		else
-			/*
-			   Only if SRC is aligned on 16-byte boundary.
-			   It allows to use movaps instead of movups, which required data
-			   to be aligned or a general-protection exception (#GP) is generated.
-			*/
-		 for (; i>0; i--)
-		 {
-			__asm__ __volatile__ (
-				"prefetchnta 320(%0);"
-				"prefetchnta 352(%0);"
-				"movaps (%0), %%xmm0;"
-				"movaps 16(%0), %%xmm1;"
-				"movaps 32(%0), %%xmm2;"
-				"movaps 48(%0), %%xmm3;"
-				"movntps %%xmm0, (%1);"
-				"movntps %%xmm1, 16(%1);"
-				"movntps %%xmm2, 32(%1);"
-				"movntps %%xmm3, 48(%1);"
-			:: "r" (src), "r" (dest) : "memory");
-			src = ((const unsigned char *)src) + 64;
-			dest = ((unsigned char *)dest) + 64;
-		}
-		/* since movntq is weakly-ordered, a "sfence"
-		 * is needed to become ordered again. */
-		__asm__ __volatile__ ("sfence":::"memory");
-		/* enables to use FPU */
-		__asm__ __volatile__ ("emms":::"memory");
-	}
-	/*
-	 *	Now do the tail of the block
-	 */
-	if (n) __memcpy(dest, src, n);
-	return retval;
-}
-
-static FUNCTARGET("mmx") void *mmx2_cpy(void *dest, const void *src, size_t n)
-{
-	void *retval = dest;
-	size_t i;
-
-	/* PREFETCH has effect even for MOVSB instruction ;) */
-	__asm__ __volatile__ (
-		"prefetchnta (%0);"
-		"prefetchnta 32(%0);"
-		"prefetchnta 64(%0);"
-		"prefetchnta 96(%0);"
-		"prefetchnta 128(%0);"
-		"prefetchnta 160(%0);"
-		"prefetchnta 192(%0);"
-		"prefetchnta 224(%0);"
-		"prefetchnta 256(%0);"
-		"prefetchnta 288(%0);"
-	: : "r" (src));
-
-	if (n >= MIN_LEN)
-	{
-		register unsigned long int delta;
-		/* Align destinition to MMREG_SIZE -boundary */
-		delta = ((unsigned long int)dest)&(MMX_MMREG_SIZE-1);
-		if (delta)
-		{
-			delta=MMX_MMREG_SIZE-delta;
-			n -= delta;
-			small_memcpy(dest, src, delta);
-		}
-		i = n >> 6; /* n/64 */
-		n&=63;
-		for (; i>0; i--)
-		{
-			__asm__ __volatile__ (
-				"prefetchnta 320(%0);"
-				"prefetchnta 352(%0);"
-				"movq (%0), %%mm0;"
-				"movq 8(%0), %%mm1;"
-				"movq 16(%0), %%mm2;"
-				"movq 24(%0), %%mm3;"
-				"movq 32(%0), %%mm4;"
-				"movq 40(%0), %%mm5;"
-				"movq 48(%0), %%mm6;"
-				"movq 56(%0), %%mm7;"
-				"movntq %%mm0, (%1);"
-				"movntq %%mm1, 8(%1);"
-				"movntq %%mm2, 16(%1);"
-				"movntq %%mm3, 24(%1);"
-				"movntq %%mm4, 32(%1);"
-				"movntq %%mm5, 40(%1);"
-				"movntq %%mm6, 48(%1);"
-				"movntq %%mm7, 56(%1);"
-			:: "r" (src), "r" (dest) : "memory");
-			src = ((const unsigned char *)src) + 64;
-			dest = ((unsigned char *)dest) + 64;
-		}
-		/* since movntq is weakly-ordered, a "sfence"
-		* is needed to become ordered again. */
-		__asm__ __volatile__ ("sfence":::"memory");
-		__asm__ __volatile__ ("emms":::"memory");
-	}
-	/*
-	 *	Now do the tail of the block
-	 */
-	if (n) __memcpy(dest, src, n);
-	return retval;
-}
-
-static FUNCTARGET("mmx") void *mmx1_cpy(void *dest, const void *src, size_t n) //3DNOW
-{
-	void *retval = dest;
-	size_t i;
-
-	/* PREFETCH has effect even for MOVSB instruction ;) */
-	__asm__ __volatile__ (
-		"prefetch (%0);"
-		"prefetch 32(%0);"
-		"prefetch 64(%0);"
-		"prefetch 96(%0);"
-		"prefetch 128(%0);"
-		"prefetch 160(%0);"
-		"prefetch 192(%0);"
-		"prefetch 224(%0);"
-		"prefetch 256(%0);"
-		"prefetch 288(%0);"
-	: : "r" (src));
-
-	if (n >= MMX1_MIN_LEN)
-	{
-		register unsigned long int delta;
-		/* Align destinition to MMREG_SIZE -boundary */
-		delta = ((unsigned long int)dest)&(MMX_MMREG_SIZE-1);
-		if (delta)
-		{
-			delta=MMX_MMREG_SIZE-delta;
-			n -= delta;
-			small_memcpy(dest, src, delta);
-		}
-		i = n >> 6; /* n/64 */
-		n&=63;
-		for (; i>0; i--)
-		{
-			__asm__ __volatile__ (
-				"prefetch 320(%0);"
-				"prefetch 352(%0);"
-				"movq (%0), %%mm0;"
-				"movq 8(%0), %%mm1;"
-				"movq 16(%0), %%mm2;"
-				"movq 24(%0), %%mm3;"
-				"movq 32(%0), %%mm4;"
-				"movq 40(%0), %%mm5;"
-				"movq 48(%0), %%mm6;"
-				"movq 56(%0), %%mm7;"
-				"movq %%mm0, (%1);"
-				"movq %%mm1, 8(%1);"
-				"movq %%mm2, 16(%1);"
-				"movq %%mm3, 24(%1);"
-				"movq %%mm4, 32(%1);"
-				"movq %%mm5, 40(%1);"
-				"movq %%mm6, 48(%1);"
-				"movq %%mm7, 56(%1);"
-			:: "r" (src), "r" (dest) : "memory");
-			src = ((const unsigned char *)src) + 64;
-			dest = ((unsigned char *)dest) + 64;
-		}
-		__asm__ __volatile__ ("femms":::"memory"); // same as mmx_cpy() but with a femms
-	}
-	/*
-	 *	Now do the tail of the block
-	 */
-	if (n) __memcpy(dest, src, n);
-	return retval;
-}
-#endif
-
-// Alam: why? memcpy may be __cdecl/_System and our code may be not the same type
-static void *cpu_cpy(void *dest, const void *src, size_t n)
-{
-	if (src == NULL)
-	{
-		CONS_Debug(DBG_MEMORY, "Memcpy from 0x0?!: %p %p %s\n", dest, src, sizeu1(n));
-		return dest;
-	}
-
-	if(dest == NULL)
-	{
-		CONS_Debug(DBG_MEMORY, "Memcpy to 0x0?!: %p %p %s\n", dest, src, sizeu1(n));
-		return dest;
-	}
-
-	return memcpy(dest, src, n);
-}
-
-static /*FUNCTARGET("mmx")*/ void *mmx_cpy(void *dest, const void *src, size_t n)
-{
-#if defined (_MSC_VER) && defined (_X86_)
-	_asm
-	{
-		mov ecx, [n]
-		mov esi, [src]
-		mov edi, [dest]
-		shr ecx, 6 // mit mmx: 64bytes per iteration
-		jz lower_64 // if lower than 64 bytes
-		loop_64: // MMX transfers multiples of 64bytes
-		movq mm0,  0[ESI] // read sources
-		movq mm1,  8[ESI]
-		movq mm2, 16[ESI]
-		movq mm3, 24[ESI]
-		movq mm4, 32[ESI]
-		movq mm5, 40[ESI]
-		movq mm6, 48[ESI]
-		movq mm7, 56[ESI]
-
-		movq  0[EDI], mm0 // write destination
-		movq  8[EDI], mm1
-		movq 16[EDI], mm2
-		movq 24[EDI], mm3
-		movq 32[EDI], mm4
-		movq 40[EDI], mm5
-		movq 48[EDI], mm6
-		movq 56[EDI], mm7
-
-		add esi, 64
-		add edi, 64
-		dec ecx
-		jnz loop_64
-		emms // close mmx operation
-		lower_64:// transfer rest of buffer
-		mov ebx,esi
-		sub ebx,src
-		mov ecx,[n]
-		sub ecx,ebx
-		shr ecx, 3 // multiples of 8 bytes
-		jz lower_8
-		loop_8:
-		movq  mm0, [esi] // read source
-		movq [edi], mm0 // write destination
-		add esi, 8
-		add edi, 8
-		dec ecx
-		jnz loop_8
-		emms // close mmx operation
-		lower_8:
-		mov ebx,esi
-		sub ebx,src
-		mov ecx,[n]
-		sub ecx,ebx
-		rep movsb
-		mov eax, [dest] // return dest
-	}
-#elif defined (__GNUC__) && defined (__i386__)
-	void *retval = dest;
-	size_t i;
-
-	if (n >= MMX1_MIN_LEN)
-	{
-		register unsigned long int delta;
-		/* Align destinition to MMREG_SIZE -boundary */
-		delta = ((unsigned long int)dest)&(MMX_MMREG_SIZE-1);
-		if (delta)
-		{
-			delta=MMX_MMREG_SIZE-delta;
-			n -= delta;
-			small_memcpy(dest, src, delta);
-		}
-		i = n >> 6; /* n/64 */
-		n&=63;
-		for (; i>0; i--)
-		{
-			__asm__ __volatile__ (
-				"movq (%0), %%mm0;"
-				"movq 8(%0), %%mm1;"
-				"movq 16(%0), %%mm2;"
-				"movq 24(%0), %%mm3;"
-				"movq 32(%0), %%mm4;"
-				"movq 40(%0), %%mm5;"
-				"movq 48(%0), %%mm6;"
-				"movq 56(%0), %%mm7;"
-				"movq %%mm0, (%1);"
-				"movq %%mm1, 8(%1);"
-				"movq %%mm2, 16(%1);"
-				"movq %%mm3, 24(%1);"
-				"movq %%mm4, 32(%1);"
-				"movq %%mm5, 40(%1);"
-				"movq %%mm6, 48(%1);"
-				"movq %%mm7, 56(%1);"
-			:: "r" (src), "r" (dest) : "memory");
-			src = ((const unsigned char *)src) + 64;
-			dest = ((unsigned char *)dest) + 64;
-		}
-		__asm__ __volatile__ ("emms":::"memory");
-	}
-	/*
-	 *	Now do the tail of the block
-	 */
-	if (n) __memcpy(dest, src, n);
-	return retval;
-#else
-	return cpu_cpy(dest, src, n);
-#endif
-}
-
-void *(*M_Memcpy)(void* dest, const void* src, size_t n) = cpu_cpy;
-
-/** Memcpy that uses MMX, 3DNow, MMXExt or even SSE
-  * Do not use on overlapped memory, use memmove for that
-  */
-void M_SetupMemcpy(void)
-{
-#if defined (__GNUC__) && defined (__i386__)
-	if (R_SSE2)
-		M_Memcpy = sse_cpy;
-	else if (R_MMXExt)
-		M_Memcpy = mmx2_cpy;
-	else if (R_3DNow)
-		M_Memcpy = mmx1_cpy;
-	else
-#endif
-	if (R_MMX)
-		M_Memcpy = mmx_cpy;
-#if 0
-	M_Memcpy = cpu_cpy;
-#endif
-}
-
 /** Return the appropriate message for a file error or end of file.
 */
 const char *M_FileError(FILE *fp)