From 24132a9dcda2fa1c37ae735f53a8cf30a2369601 Mon Sep 17 00:00:00 2001 From: Sally Coolatta Date: Tue, 26 Dec 2023 03:00:05 -0500 Subject: [PATCH] Use C++ templates for DrawColumn/Span Two reasons: - Makes it more straight-forward to add brightmaps to the non-power-of-two rendering functions. - Made it easier to split off brightmap rendering. Hopefully improves performance, but I haven't thoroughly tested this. --- src/CMakeLists.txt | 2 +- src/libdivide.h | 2484 +++++++++++++++++++++++--------- src/{r_draw.c => r_draw.cpp} | 52 +- src/r_draw.h | 175 +-- src/r_draw16.c | 214 --- src/r_draw8.c | 2564 ---------------------------------- src/r_draw8_flat.c | 80 -- src/r_draw8_npo2.c | 1618 --------------------- src/r_draw_column.cpp | 413 ++++++ src/r_draw_span.cpp | 866 ++++++++++++ src/r_plane.cpp | 76 +- src/r_segs.cpp | 2 +- src/r_splats.c | 12 - src/screen.c | 244 ++-- 14 files changed, 3336 insertions(+), 5466 deletions(-) rename src/{r_draw.c => r_draw.cpp} (92%) delete mode 100644 src/r_draw16.c delete mode 100644 src/r_draw8.c delete mode 100644 src/r_draw8_flat.c delete mode 100644 src/r_draw8_npo2.c create mode 100644 src/r_draw_column.cpp create mode 100644 src/r_draw_span.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index af7b84605..045452975 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -72,7 +72,7 @@ add_executable(SRB2SDL2 MACOSX_BUNDLE WIN32 r_data.c r_debug.cpp r_debug_parser.cpp - r_draw.c + r_draw.cpp r_fps.c r_main.cpp r_plane.cpp diff --git a/src/libdivide.h b/src/libdivide.h index 1a589c7e5..96dd27211 100644 --- a/src/libdivide.h +++ b/src/libdivide.h @@ -1,124 +1,112 @@ // libdivide.h - Optimized integer division // https://libdivide.com // -// Copyright (C) 2010 - 2019 ridiculous_fish, -// Copyright (C) 2016 - 2019 Kim Walisch, +// Copyright (C) 2010 - 2022 ridiculous_fish, +// Copyright (C) 2016 - 2022 Kim Walisch, // // libdivide is dual-licensed under the Boost or zlib licenses. 
// You may use libdivide under the terms of either of these. -// See LICENSE.txt in the libdivide source code repository for more details. - - -// NOTICE: This is an altered source version of libdivide. -// Libdivide is used here under the terms of the zlib license. -// Here is the zlib license text from https://github.com/ridiculousfish/libdivide/blob/master/LICENSE.txt -/* - zlib License - ------------ - - Copyright (C) 2010 - 2019 ridiculous_fish, - Copyright (C) 2016 - 2019 Kim Walisch, - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. - - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; you must not - claim that you wrote the original software. If you use this software - in a product, an acknowledgment in the product documentation would be - appreciated but is not required. - 2. Altered source versions must be plainly marked as such, and must not be - misrepresented as being the original software. - 3. This notice may not be removed or altered from any source distribution. -*/ - - -// This version of libdivide has been modified for use with SRB2. -// Changes made include: -// - unused parts commented out (to avoid the need to fix C90 compilation issues with them) -// - C90 compilation issues fixed with used parts -// - use I_Error for errors +// See LICENSE.txt for more details. 
#ifndef LIBDIVIDE_H #define LIBDIVIDE_H -#define LIBDIVIDE_VERSION "3.0" -#define LIBDIVIDE_VERSION_MAJOR 3 +#define LIBDIVIDE_VERSION "5.0" +#define LIBDIVIDE_VERSION_MAJOR 5 #define LIBDIVIDE_VERSION_MINOR 0 #include - -#if defined(__cplusplus) - #include - #include - #include -#else - #include - #include +#if !defined(__AVR__) +#include +#include #endif -#if defined(LIBDIVIDE_AVX512) - #include -#elif defined(LIBDIVIDE_AVX2) - #include -#elif defined(LIBDIVIDE_SSE2) - #include +#if defined(LIBDIVIDE_SSE2) +#include +#endif +#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512) +#include +#endif +#if defined(LIBDIVIDE_NEON) +#include #endif #if defined(_MSC_VER) - #include - // disable warning C4146: unary minus operator applied - // to unsigned type, result still unsigned - #pragma warning(disable: 4146) - #define LIBDIVIDE_VC +#include +#pragma warning(push) +// disable warning C4146: unary minus operator applied +// to unsigned type, result still unsigned +#pragma warning(disable : 4146) +// disable warning C4204: nonstandard extension used : non-constant aggregate +// initializer +// +// It's valid C99 +#pragma warning(disable : 4204) +#define LIBDIVIDE_VC #endif #if !defined(__has_builtin) - #define __has_builtin(x) 0 +#define __has_builtin(x) 0 #endif #if defined(__SIZEOF_INT128__) - #define HAS_INT128_T - // clang-cl on Windows does not yet support 128-bit division - #if !(defined(__clang__) && defined(LIBDIVIDE_VC)) - #define HAS_INT128_DIV - #endif +#define HAS_INT128_T +// clang-cl on Windows does not yet support 128-bit division +#if !(defined(__clang__) && defined(LIBDIVIDE_VC)) +#define HAS_INT128_DIV +#endif #endif #if defined(__x86_64__) || defined(_M_X64) - #define LIBDIVIDE_X86_64 +#define LIBDIVIDE_X86_64 #endif #if defined(__i386__) - #define LIBDIVIDE_i386 +#define LIBDIVIDE_i386 #endif #if defined(__GNUC__) || defined(__clang__) - #define LIBDIVIDE_GCC_STYLE_ASM +#define LIBDIVIDE_GCC_STYLE_ASM #endif #if defined(__cplusplus) || 
defined(LIBDIVIDE_VC) - #define LIBDIVIDE_FUNCTION __FUNCTION__ +#define LIBDIVIDE_FUNCTION __FUNCTION__ #else - #define LIBDIVIDE_FUNCTION __func__ +#define LIBDIVIDE_FUNCTION __func__ #endif -#define LIBDIVIDE_ERROR(msg) \ - I_Error("libdivide.h:%d: %s(): Error: %s\n", \ - __LINE__, LIBDIVIDE_FUNCTION, msg); +// Set up forced inlining if possible. +// We need both the attribute and keyword to avoid "might not be inlineable" warnings. +#ifdef __has_attribute +#if __has_attribute(always_inline) +#define LIBDIVIDE_INLINE __attribute__((always_inline)) inline +#endif +#endif +#ifndef LIBDIVIDE_INLINE +#define LIBDIVIDE_INLINE inline +#endif -#if defined(LIBDIVIDE_ASSERTIONS_ON) - #define LIBDIVIDE_ASSERT(x) \ - if (!(x)) { \ - I_Error("libdivide.h:%d: %s(): Assertion failed: %s\n", \ - __LINE__, LIBDIVIDE_FUNCTION, #x); \ - } +#if defined(__AVR__) +#define LIBDIVIDE_ERROR(msg) #else - #define LIBDIVIDE_ASSERT(x) +#define LIBDIVIDE_ERROR(msg) \ + do { \ + fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \ + abort(); \ + } while (0) +#endif + +#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) +#define LIBDIVIDE_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \ + LIBDIVIDE_FUNCTION, #x); \ + abort(); \ + } \ + } while (0) +#else +#define LIBDIVIDE_ASSERT(x) #endif #ifdef __cplusplus @@ -131,6 +119,16 @@ namespace libdivide { // by up to 10% because of reduced memory bandwidth. 
#pragma pack(push, 1) +struct libdivide_u16_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_t { + int16_t magic; + uint8_t more; +}; + struct libdivide_u32_t { uint32_t magic; uint8_t more; @@ -151,6 +149,16 @@ struct libdivide_s64_t { uint8_t more; }; +struct libdivide_u16_branchfree_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_branchfree_t { + int16_t magic; + uint8_t more; +}; + struct libdivide_u32_branchfree_t { uint32_t magic; uint8_t more; @@ -206,60 +214,105 @@ struct libdivide_s64_branchfree_t { // whether the divisor is negated. In branchfree strategy, it is not negated. enum { + LIBDIVIDE_16_SHIFT_MASK = 0x1F, LIBDIVIDE_32_SHIFT_MASK = 0x1F, LIBDIVIDE_64_SHIFT_MASK = 0x3F, LIBDIVIDE_ADD_MARKER = 0x40, LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 }; -//static inline struct libdivide_s32_t libdivide_s32_gen(int32_t d); -static inline struct libdivide_u32_t libdivide_u32_gen(uint32_t d); -//static inline struct libdivide_s64_t libdivide_s64_gen(int64_t d); -//static inline struct libdivide_u64_t libdivide_u64_gen(uint64_t d); +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d); -/*static inline struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); -static inline struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); -static inline struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); -static inline struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);*/ +static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t 
libdivide_s16_branchfree_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); -//static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); -static inline uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom); -//static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); -//static inline uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do( + int16_t numer, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( + uint16_t numer, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_do( + int32_t numer, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( + uint32_t numer, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_do( + int64_t numer, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( + uint64_t numer, const struct libdivide_u64_t *denom); -/*static inline int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom); -static inline uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t 
*denom); -static inline int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom);*/ +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( + int16_t numer, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( + int32_t numer, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do( + int64_t numer, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom); -/*static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); -static inline uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); -static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); -static inline uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);*/ +static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); -/*static inline int32_t libdivide_s32_branchfree_recover(const struct 
libdivide_s32_branchfree_t *denom); -static inline uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom); -static inline int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom);*/ +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( + const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( + const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( + const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( + const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover( + const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover( + const struct libdivide_u64_branchfree_t *denom); //////// Internal Utility Functions -static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { +static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) { + uint32_t xl = x, yl = y; + uint32_t rl = xl * yl; + return (uint16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) { + int32_t xl = x, yl = y; + int32_t rl = xl * yl; + // needs to be arithmetic shift + return (int16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { uint64_t xl = x, yl = y; uint64_t rl = xl * yl; return (uint32_t)(rl >> 32); } -static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { +static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { int64_t xl = x, yl = y; int64_t rl = xl * yl; // needs to be arithmetic shift return (int32_t)(rl >> 32); } -static inline uint64_t 
libdivide_mullhi_u64(uint64_t x, uint64_t y) { -#if defined(LIBDIVIDE_VC) && \ - defined(LIBDIVIDE_X86_64) +static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __umulh(x, y); #elif defined(HAS_INT128_T) __uint128_t xl = x, yl = y; @@ -284,9 +337,8 @@ static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { #endif } -static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { -#if defined(LIBDIVIDE_VC) && \ - defined(LIBDIVIDE_X86_64) +static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __mulh(x, y); #elif defined(HAS_INT128_T) __int128_t xl = x, yl = y; @@ -307,9 +359,41 @@ static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { #endif } -static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { -#if defined(__GNUC__) || \ - __has_builtin(__builtin_clz) +static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + // On the AVR 8-bit architecture __builtin_clz() works on a int16_t. 
+ return __builtin_clz(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) + // Fast way to count leading zeros + return __builtin_clz(val) - 16; +#elif defined(LIBDIVIDE_VC) + unsigned long result; + if (_BitScanReverse(&result, (unsigned long)val)) { + return (int16_t)(15 - result); + } + return 0; +#else + if (val == 0) return 16; + int16_t result = 4; + uint16_t hi = 0xFU << 12; + while ((val & hi) == 0) { + hi >>= 4; + result += 4; + } + while (val & hi) { + result -= 1; + hi <<= 1; + } + return result; +#endif +} + +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + return __builtin_clzl(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) // Fast way to count leading zeros return __builtin_clz(val); #elif defined(LIBDIVIDE_VC) @@ -319,8 +403,7 @@ static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { } return 0; #else - if (val == 0) - return 32; + if (val == 0) return 32; int32_t result = 8; uint32_t hi = 0xFFU << 24; while ((val & hi) == 0) { @@ -335,9 +418,8 @@ static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { #endif } -static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { -#if defined(__GNUC__) || \ - __has_builtin(__builtin_clzll) +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) // Fast way to count leading zeros return __builtin_clzll(val); #elif defined(LIBDIVIDE_VC) && defined(_WIN64) @@ -354,17 +436,25 @@ static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { #endif } +// libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit +// uint {v}. The result must fit in 16 bits. 
+// Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( + uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) { + uint32_t n = ((uint32_t)u1 << 16) | u0; + uint16_t result = (uint16_t)(n / v); + *r = (uint16_t)(n - result * (uint32_t)v); + return result; +} + // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit // uint {v}. The result must fit in 32 bits. // Returns the quotient directly and the remainder in *r -static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { -#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) +static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32( + uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { +#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint32_t result; - __asm__("divl %[v]" - : "=a"(result), "=d"(*r) - : [v] "r"(v), "a"(u0), "d"(u1) - ); + __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1)); return result; #else uint64_t n = ((uint64_t)u1 << 32) | u0; @@ -374,108 +464,115 @@ static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint3 #endif } -// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit -// uint {v}. The result must fit in 64 bits. -// Returns the quotient directly and the remainder in *r -/*static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { -#if defined(LIBDIVIDE_X86_64) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) +// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The +// result must fit in 64 bits. Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64( + uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) { + // N.B. resist the temptation to use __uint128_t here. 
+ // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than + // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because + // it's not LIBDIVIDE_INLINEd. +#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint64_t result; - __asm__("divq %[v]" - : "=a"(result), "=d"(*r) - : [v] "r"(v), "a"(u0), "d"(u1) - ); - return result; -#elif defined(HAS_INT128_T) && \ - defined(HAS_INT128_DIV) - __uint128_t n = ((__uint128_t)u1 << 64) | u0; - uint64_t result = (uint64_t)(n / v); - *r = (uint64_t)(n - result * (__uint128_t)v); + __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi)); return result; #else - // Code taken from Hacker's Delight: - // http://www.hackersdelight.org/HDcode/divlu.c. - // License permits inclusion here per: - // http://www.hackersdelight.org/permissions.htm + // We work in base 2**32. + // A uint32 holds a single digit. A uint64 holds two digits. + // Our numerator is conceptually [num3, num2, num1, num0]. + // Our denominator is [den1, den0]. + const uint64_t b = ((uint64_t)1 << 32); - const uint64_t b = (1ULL << 32); // Number base (32 bits) - uint64_t un1, un0; // Norm. dividend LSD's - uint64_t vn1, vn0; // Norm. divisor digits - uint64_t q1, q0; // Quotient digits - uint64_t un64, un21, un10; // Dividend digit pairs - uint64_t rhat; // A remainder - int32_t s; // Shift amount for norm + // The high and low digits of our computed quotient. + uint32_t q1; + uint32_t q0; - // If overflow, set rem. to an impossible value, - // and return the largest possible quotient - if (u1 >= v) { - *r = (uint64_t) -1; - return (uint64_t) -1; + // The normalization shift factor. + int shift; + + // The high and low digits of our denominator (after normalizing). + // Also the low 2 digits of our numerator (after normalizing). + uint32_t den1; + uint32_t den0; + uint32_t num1; + uint32_t num0; + + // A partial remainder. 
+ uint64_t rem; + + // The estimated quotient, and its corresponding remainder (unrelated to true remainder). + uint64_t qhat; + uint64_t rhat; + + // Variables used to correct the estimated quotient. + uint64_t c1; + uint64_t c2; + + // Check for overflow and divide by 0. + if (numhi >= den) { + if (r != NULL) *r = ~0ull; + return ~0ull; } - // count leading zeros - s = libdivide_count_leading_zeros64(v); - if (s > 0) { - // Normalize divisor - v = v << s; - un64 = (u1 << s) | (u0 >> (64 - s)); - un10 = u0 << s; // Shift dividend left - } else { - // Avoid undefined behavior of (u0 >> 64). - // The behavior is undefined if the right operand is - // negative, or greater than or equal to the length - // in bits of the promoted left operand. - un64 = u1; - un10 = u0; - } + // Determine the normalization factor. We multiply den by this, so that its leading digit is at + // least half b. In binary this means just shifting left by the number of leading zeros, so that + // there's a 1 in the MSB. + // We also shift numer by the same amount. This cannot overflow because numhi < den. + // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting + // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is + // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it. + shift = libdivide_count_leading_zeros64(den); + den <<= shift; + numhi <<= shift; + numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63); + numlo <<= shift; - // Break divisor up into two 32-bit digits - vn1 = v >> 32; - vn0 = v & 0xFFFFFFFF; + // Extract the low digits of the numerator and both digits of the denominator. 
+ num1 = (uint32_t)(numlo >> 32); + num0 = (uint32_t)(numlo & 0xFFFFFFFFu); + den1 = (uint32_t)(den >> 32); + den0 = (uint32_t)(den & 0xFFFFFFFFu); - // Break right half of dividend into two digits - un1 = un10 >> 32; - un0 = un10 & 0xFFFFFFFF; + // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. + // Estimate q1 as [n3 n2] / [d1], and then correct it. + // Note while qhat may be 2 digits, q1 is always 1 digit. + qhat = numhi / den1; + rhat = numhi % den1; + c1 = qhat * den0; + c2 = rhat * b + num1; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q1 = (uint32_t)qhat; - // Compute the first quotient digit, q1 - q1 = un64 / vn1; - rhat = un64 - q1 * vn1; + // Compute the true (partial) remainder. + rem = numhi * b + num1 - q1 * den; - while (q1 >= b || q1 * vn0 > b * rhat + un1) { - q1 = q1 - 1; - rhat = rhat + vn1; - if (rhat >= b) - break; - } + // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. + // Estimate q0 as [rem1 rem0] / [d1] and correct it. + qhat = rem / den1; + rhat = rem % den1; + c1 = qhat * den0; + c2 = rhat * b + num0; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q0 = (uint32_t)qhat; - // Multiply and subtract - un21 = un64 * b + un1 - q1 * v; - - // Compute the second quotient digit - q0 = un21 / vn1; - rhat = un21 - q0 * vn1; - - while (q0 >= b || q0 * vn0 > b * rhat + un0) { - q0 = q0 - 1; - rhat = rhat + vn1; - if (rhat >= b) - break; - } - - *r = (un21 * b + un0 - q0 * v) >> s; - return q1 * b + q0; + // Return remainder if requested. 
+ if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift; + return ((uint64_t)q1 << 32) | q0; #endif -}*/ +} + +#if !(defined(HAS_INT128_T) && \ + defined(HAS_INT128_DIV)) // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) -static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) { +static LIBDIVIDE_INLINE void libdivide_u128_shift( + uint64_t *u1, uint64_t *u0, int32_t signed_shift) { if (signed_shift > 0) { uint32_t shift = signed_shift; *u1 <<= shift; *u1 |= *u0 >> (64 - shift); *u0 <<= shift; - } - else if (signed_shift < 0) { + } else if (signed_shift < 0) { uint32_t shift = -signed_shift; *u0 >>= shift; *u0 |= *u1 << (64 - shift); @@ -483,10 +580,12 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign } } +#endif + // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. -/*static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { -#if defined(HAS_INT128_T) && \ - defined(HAS_INT128_DIV) +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64( + uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { +#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV) __uint128_t ufull = u_hi; __uint128_t vfull = v_hi; ufull = (ufull << 64) | u_lo; @@ -499,7 +598,10 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign #else // Adapted from "Unsigned Doubleword Division" in Hacker's Delight // We want to compute u / v - typedef struct { uint64_t hi; uint64_t lo; } u128_t; + typedef struct { + uint64_t hi; + uint64_t lo; + } u128_t; u128_t u = {u_hi, u_lo}; u128_t v = {v_hi, v_lo}; @@ -519,7 +621,7 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign // Normalize the divisor so its MSB is 1 u128_t v1t = v; libdivide_u128_shift(&v1t.hi, &v1t.lo, n); - uint64_t v1 = v1t.hi; // i.e. 
v1 = v1t >> 64 + uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64 // To ensure no overflow u128_t u1 = u; @@ -537,7 +639,7 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign // Make q0 correct or too small by 1 // Equivalent to `if (q0 != 0) q0 = q0 - 1;` if (q0.hi != 0 || q0.lo != 0) { - q0.hi -= (q0.lo == 0); // borrow + q0.hi -= (q0.lo == 0); // borrow q0.lo -= 1; } @@ -549,22 +651,21 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign // Each term is 128 bit // High half of full product (upper 128 bits!) are dropped u128_t q0v = {0, 0}; - q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide_mullhi_u64(q0.lo, v.lo); - q0v.lo = q0.lo*v.lo; + q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo); + q0v.lo = q0.lo * v.lo; // Compute u - q0v as u_q0v // This is the remainder u128_t u_q0v = u; - u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow + u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow u_q0v.lo -= q0v.lo; // Check if u_q0v >= v // This checks if our remainder is larger than the divisor - if ((u_q0v.hi > v.hi) || - (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { + if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { // Increment q0 q0.lo += 1; - q0.hi += (q0.lo == 0); // carry + q0.hi += (q0.lo == 0); // carry // Subtract v from remainder u_q0v.hi -= v.hi + (u_q0v.lo < v.lo); @@ -577,19 +678,182 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign LIBDIVIDE_ASSERT(q0.hi == 0); return q0.lo; #endif -}*/ +} -////////// UINT32 - -static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) { - struct libdivide_u32_t result; - uint32_t floor_log_2_d; +////////// UINT16 +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen( + uint16_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } - floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); + struct 
libdivide_u16_t result; + uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d)); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } else { + uint8_t more; + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint16_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) { + // This power works + more = floor_log_2_d; + } else { + // We have to use the general 17-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases. 
+ } + return result; +} + +struct libdivide_u16_t libdivide_u16_gen(uint16_t d) { + return libdivide_internal_u16_gen(d, 0); +} + +struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1); + struct libdivide_u16_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)}; + return ret; +} + +// The original libdivide_u16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_u16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { + if (!magic) { + return numer >> more; + } else { + uint16_t q = libdivide_mullhi_u16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint16_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_16_SHIFT_MASK); + } else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) { + return libdivide_u16_do_raw(numer, denom->magic, denom->more); +} + +uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t *denom) { + uint16_t q = libdivide_mullhi_u16(denom->magic, numer); + uint16_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(16 + shift) + // Therefore we have d = 2^(16 + shift) / m + // We need to ceil it. 
+ // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint16_t hi_dividend = (uint16_t)1 << shift; + uint16_t rem_ignored; + return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ((uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ((uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! 
+ uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +////////// UINT32 + +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen( + uint32_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u32_t result; + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); // Power of 2 if ((d & (d - 1)) == 0) { @@ -602,26 +866,25 @@ static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int } else { uint8_t more; uint32_t rem, proposed_m; - uint32_t e; - proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem); + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); - e = d - rem; + const uint32_t e = d - rem; // This power works if e < 2**floor_log_2_d. - if (!branchfree && (e < (1U << floor_log_2_d))) { + if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) { // This power works - more = floor_log_2_d; + more = (uint8_t)floor_log_2_d; } else { // We have to use the general 33-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and // its remainder. By doubling both, and then correcting the // remainder, we can compute the larger division. 
// don't care about overflow here - in fact, we expect it - const uint32_t twice_rem = rem + rem; proposed_m += proposed_m; + const uint32_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; @@ -638,27 +901,26 @@ struct libdivide_u32_t libdivide_u32_gen(uint32_t d) { return libdivide_internal_u32_gen(d, 0); } -/*struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { +struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { if (d == 1) { LIBDIVIDE_ERROR("branchfree divider must be != 1"); } struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1); - struct libdivide_u32_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; + struct libdivide_u32_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; return ret; -}*/ +} uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return numer >> more; - } - else { + } else { uint32_t q = libdivide_mullhi_u32(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint32_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_32_SHIFT_MASK); - } - else { + } else { // All upper bits are 0, // don't need to mask them off. 
return q >> more; @@ -666,7 +928,8 @@ uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { } } -/*uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { +uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { uint32_t q = libdivide_mullhi_u32(denom->magic, numer); uint32_t t = ((numer - q) >> 1) + q; return t >> denom->more; @@ -677,14 +940,14 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - return 1U << shift; + return (uint32_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(32 + shift) // Therefore we have d = 2^(32 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor - uint32_t hi_dividend = 1U << shift; + uint32_t hi_dividend = (uint32_t)1 << shift; uint32_t rem_ignored; return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored); } else { @@ -693,8 +956,8 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. - uint64_t half_n = 1ULL << (32 + shift); - uint64_t d = (1ULL << 32) | denom->magic; + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! uint32_t half_q = (uint32_t)(half_n / d); @@ -703,7 +966,7 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. 
// Note that rem<<1 cannot overflow, since rem < d and d is 33 bits - uint32_t full_q = half_q + half_q + ((rem<<1) >= d); + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; @@ -715,15 +978,15 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - return 1U << (shift + 1); + return (uint32_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. - uint64_t half_n = 1ULL << (32 + shift); - uint64_t d = (1ULL << 32) | denom->magic; + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! uint32_t half_q = (uint32_t)(half_n / d); @@ -732,16 +995,17 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. 
// Note that rem<<1 cannot overflow, since rem < d and d is 33 bits - uint32_t full_q = half_q + half_q + ((rem<<1) >= d); + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; } -}*/ +} /////////// UINT64 -/*static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen( + uint64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -761,15 +1025,15 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ uint64_t proposed_m, rem; uint8_t more; // (1 << (64 + floor_log_2_d)) / d - proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint64_t e = d - rem; // This power works if e < 2**floor_log_2_d. - if (!branchfree && e < (1ULL << floor_log_2_d)) { + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d; + more = (uint8_t)floor_log_2_d; } else { // We have to use the general 65-bit algorithm. We need to compute // (2**power) / d. 
However, we already have (2**(power-1))/d and @@ -779,7 +1043,7 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ proposed_m += proposed_m; const uint64_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; @@ -802,7 +1066,8 @@ struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { LIBDIVIDE_ERROR("branchfree divider must be != 1"); } struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1); - struct libdivide_u64_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; + struct libdivide_u64_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; return ret; } @@ -810,22 +1075,21 @@ uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return numer >> more; - } - else { + } else { uint64_t q = libdivide_mullhi_u64(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint64_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_64_SHIFT_MASK); - } - else { - // All upper bits are 0, - // don't need to mask them off. + } else { + // All upper bits are 0, + // don't need to mask them off. 
return q >> more; } } } -uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { +uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t t = ((numer - q) >> 1) + q; return t >> denom->more; @@ -836,14 +1100,14 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { - return 1ULL << shift; + return (uint64_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(64 + shift) // Therefore we have d = 2^(64 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor - uint64_t hi_dividend = 1ULL << shift; + uint64_t hi_dividend = (uint64_t)1 << shift; uint64_t rem_ignored; return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); } else { @@ -855,19 +1119,20 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. - uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! 
uint64_t r_hi, r_lo; - uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; - uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); return full_q + 1; @@ -879,7 +1144,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { - return 1ULL << (shift + 1); + return (uint64_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). // Notice (m + 2^64) is a 65 bit number. This gets hairy. See @@ -889,28 +1154,205 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. - uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! 
uint64_t r_hi, r_lo; - uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; - uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); return full_q + 1; } -}*/ +} + +/////////// SINT16 + +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen( + int16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s16_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint16_t ud = (uint16_t)d; + uint16_t absD = (d < 0) ? -ud : ud; + uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and normal paths are exactly the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + // the dividend here is 2**(floor_log_2_d + 31), so the low 16 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint16_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int16_t. + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + + proposed_m += 1; + int16_t magic = (int16_t)proposed_m; + + // Mark if we are negative. Note we only negate the magic number in the + // branchfull case. + if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s16_t libdivide_s16_gen(int16_t d) { + return libdivide_internal_s16_gen(d, 0); +} + +struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) { + struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1); + struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more}; + return result; +} + +// The original libdivide_s16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_s16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. 
+int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) { + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!magic) { + uint16_t sign = (int8_t)more >> 7; + uint16_t mask = ((uint16_t)1 << shift) - 1; + uint16_t uq = numer + ((numer >> 15) & mask); + int16_t q = (int16_t)uq; + q >>= shift; + q = (q ^ sign) - sign; + return q; + } else { + uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint16_t)numer ^ sign) - sign; + } + int16_t q = (int16_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) { + return libdivide_s16_do_raw(numer, denom->magic, denom->more); +} + +int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + int16_t magic = denom->magic; + int16_t q = libdivide_mullhi_s16(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + uint16_t q_sign = (uint16_t)(q >> 15); + q += q_sign & (((uint16_t)1 << shift) - is_power_of_2); + + // Now arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + if (!denom->magic) { + uint16_t absD = (uint16_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int16_t)absD; + } else 
{ + // Unsigned math is much easier + // We negate the magic number only in the branchfull case, and we don't + // know which case we're in. However we have enough information to + // determine the correct sign of the magic number. The divisor was + // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, + // the magic number's sign is opposite that of the divisor. + // We want to compute the positive magic number. + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + // Handle the power of 2 case (including branchfree) + if (denom->magic == 0) { + int16_t result = (uint16_t)1 << shift; + return negative_divisor ? -result : result; + } + + uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic); + uint32_t n = (uint32_t)1 << (16 + shift); // this shift cannot exceed 30 + uint16_t q = (uint16_t)(n / d); + int16_t result = (int16_t)q; + result += 1; + return negative_divisor ? -result : result; + } +} + +int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) { + return libdivide_s16_recover((const struct libdivide_s16_t *)denom); +} /////////// SINT32 -/*static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen( + int32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -931,7 +1373,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ if ((absD & (absD - 1)) == 0) { // Branchfree and normal paths are exactly the same result.magic = 0; - result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0)); } else { LIBDIVIDE_ASSERT(floor_log_2_d >= 1); @@ -939,14 +1381,14 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word // is 0 and the high word is floor_log_2_d - 1 uint32_t rem, proposed_m; - proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem); + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem); const uint32_t e = absD - rem; // We are going to start with a power of floor_log_2_d - 1. // This works if works if e < 2**floor_log_2_d. - if (!branchfree && e < (1U << floor_log_2_d)) { + if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d - 1; + more = (uint8_t)(floor_log_2_d - 1); } else { // We need to go one higher. This should not make proposed_m // overflow, but it will make it negative when interpreted as an @@ -954,7 +1396,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ proposed_m += proposed_m; const uint32_t twice_rem = rem + rem; if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } proposed_m += 1; @@ -991,7 +1433,7 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { if (!denom->magic) { uint32_t sign = (int8_t)more >> 7; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; uint32_t uq = numer + ((numer >> 31) & mask); int32_t q = (int32_t)uq; q >>= shift; @@ -1027,7 +1469,7 @@ int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_br // 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); uint32_t q_sign = (uint32_t)(q >> 31); - q += q_sign & ((1U << shift) - is_power_of_2); + q += q_sign & (((uint32_t)1 << shift) - is_power_of_2); // Now 
arithmetic right shift q >>= shift; @@ -1041,7 +1483,7 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - uint32_t absD = 1U << shift; + uint32_t absD = (uint32_t)1 << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } @@ -1055,17 +1497,16 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { // the magic number's sign is opposite that of the divisor. // We want to compute the positive magic number. int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); - int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) - ? denom->magic > 0 : denom->magic < 0; + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; // Handle the power of 2 case (including branchfree) if (denom->magic == 0) { - int32_t result = 1U << shift; + int32_t result = (uint32_t)1 << shift; return negative_divisor ? -result : result; } uint32_t d = (uint32_t)(magic_was_negated ? 
-denom->magic : denom->magic); - uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30 + uint64_t n = (uint64_t)1 << (32 + shift); // this shift cannot exceed 30 uint32_t q = (uint32_t)(n / d); int32_t result = (int32_t)q; result += 1; @@ -1075,11 +1516,12 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) { return libdivide_s32_recover((const struct libdivide_s32_t *)denom); -}*/ +} ///////////// SINT64 -/*static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen( + int64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -1100,20 +1542,20 @@ int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t if ((absD & (absD - 1)) == 0) { // Branchfree and non-branchfree cases are the same result.magic = 0; - result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); } else { // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word // is 0 and the high word is floor_log_2_d - 1 uint8_t more; uint64_t rem, proposed_m; - proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem); + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem); const uint64_t e = absD - rem; // We are going to start with a power of floor_log_2_d - 1. // This works if works if e < 2**floor_log_2_d. - if (!branchfree && e < (1ULL << floor_log_2_d)) { + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d - 1; + more = (uint8_t)(floor_log_2_d - 1); } else { // We need to go one higher. 
This should not make proposed_m // overflow, but it will make it negative when interpreted as an @@ -1125,7 +1567,7 @@ int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t // also set ADD_MARKER this is an annoying optimization that // enables algorithm #4 to avoid the mask. However we always set it // in the branchfree case - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } proposed_m += 1; int64_t magic = (int64_t)proposed_m; @@ -1158,8 +1600,8 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - if (!denom->magic) { // shift path - uint64_t mask = (1ULL << shift) - 1; + if (!denom->magic) { // shift path + uint64_t mask = ((uint64_t)1 << shift) - 1; uint64_t uq = numer + ((numer >> 63) & mask); int64_t q = (int64_t)uq; q >>= shift; @@ -1197,7 +1639,7 @@ int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_br // 2, or (2**shift) if it is not a power of 2. 
uint64_t is_power_of_2 = (magic == 0); uint64_t q_sign = (uint64_t)(q >> 63); - q += q_sign & ((1ULL << shift) - is_power_of_2); + q += q_sign & (((uint64_t)1 << shift) - is_power_of_2); // Arithmetic right shift q >>= shift; @@ -1210,8 +1652,8 @@ int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_br int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - if (denom->magic == 0) { // shift path - uint64_t absD = 1ULL << shift; + if (denom->magic == 0) { // shift path + uint64_t absD = (uint64_t)1 << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } @@ -1219,11 +1661,10 @@ int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { } else { // Unsigned math is much easier int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); - int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) - ? denom->magic > 0 : denom->magic < 0; + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic); - uint64_t n_hi = 1ULL << shift, n_lo = 0; + uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0; uint64_t rem_ignored; uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored); int64_t result = (int64_t)(q + 1); @@ -1236,32 +1677,364 @@ int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) { return libdivide_s64_recover((const struct libdivide_s64_t *)denom); -}*/ +} -#if defined(LIBDIVIDE_AVX512) +// Simplest possible vector type division: treat the vector type as an array +// of underlying native type. +// +// Use a union to read a vector via pointer-to-integer, without violating strict +// aliasing. 
+#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \ + const size_t count = sizeof(VecT) / sizeof(IntT); \ + union type_pun_vec { \ + VecT vec; \ + IntT arr[sizeof(VecT) / sizeof(IntT)]; \ + }; \ + union type_pun_vec result; \ + union type_pun_vec input; \ + input.vec = numers; \ + for (size_t loop = 0; loop < count; ++loop) { \ + result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \ + } \ + return result.vec; -static inline __m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom); -static inline __m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom); -static inline __m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom); -static inline __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom); +#if defined(LIBDIVIDE_NEON) -static inline __m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128( + uint16x8_t numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128( + int16x8_t numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128( + uint32x4_t numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128( + int32x4_t numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128( + uint64x2_t numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE int64x2_t 
libdivide_s64_do_vec128( + int64x2_t numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128( + uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128( + int16x8_t numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128( + uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128( + int32x4_t numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128( + uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128( + int64x2_t numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions -static inline __m512i libdivide_s64_signbits(__m512i v) {; +// Logical right shift by runtime value. +// NEON implements right shift as left shits by negative values. +static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) { + int32_t wamt = (int32_t)(amt); + return vshlq_u32(v, vdupq_n_s32(-wamt)); +} + +static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) { + int64_t wamt = (int64_t)(amt); + return vshlq_u64(v, vdupq_n_s64(-wamt)); +} + +// Arithmetic right shift by runtime value. 
+static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) { + int32_t wamt = (int32_t)(amt); + return vshlq_s32(v, vdupq_n_s32(-wamt)); +} + +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) { + int64_t wamt = (int64_t)(amt); + return vshlq_s64(v, vdupq_n_s64(-wamt)); +} + +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); } + +static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) { + // Desire is [x0, x1, x2, x3] + uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b)); // [_, x0, _, x1] + uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b)); //[_, x2, _, x3] + return vuzp2q_u32(w1, w2); // [x0, x1, x2, x3] +} + +static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) { + int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b)); // [_, x0, _, x1] + int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b)); //[_, x2, _, x3] + return vuzp2q_s32(w1, w2); // [x0, x1, x2, x3] +} + +static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) { + // full 128 bits product is: + // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) + // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. + + // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits. + uint64x2_t y = vdupq_n_u64(sy); + uint32x2_t x0 = vmovn_u64(x); + uint32x2_t y0 = vmovn_u64(y); + uint32x2_t x1 = vshrn_n_u64(x, 32); + uint32x2_t y1 = vshrn_n_u64(y, 32); + + // Compute x0*y0. + uint64x2_t x0y0 = vmull_u32(x0, y0); + uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32); + + // Compute other intermediate products. + uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0); // temp = x0y0_hi + x1*y0; + // We want to split temp into its low 32 bits and high 32 bits, both + // in the low half of 64 bit registers. 
+ // Use shifts to avoid needing a reg for the mask. + uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32); // temp_lo = temp & 0xFFFFFFFF; + uint64x2_t temp_hi = vshrq_n_u64(temp, 32); // temp_hi = temp >> 32; + + temp_lo = vmlal_u32(temp_lo, x0, y1); // temp_lo += x0*y0 + temp_lo = vshrq_n_u64(temp_lo, 32); // temp_lo >>= 32 + temp_hi = vmlal_u32(temp_hi, x1, y1); // temp_hi += x1*y1 + uint64x2_t result = vaddq_u64(temp_hi, temp_lo); + return result; +} + +static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { + int64x2_t p = vreinterpretq_s64_u64( + libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy))); + int64x2_t y = vdupq_n_s64(sy); + int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y); + int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x); + p = vsubq_s64(p, t1); + p = vsubq_s64(p, t2); + return p; +} + +////////// UINT16 + +uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){ + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)} + +uint16x8_t libdivide_u16_branchfree_do_vec128( + uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){ + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)} + +////////// UINT32 + +uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u32_neon_srl(numers, more); + } else { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // Note we can use halving-subtract to avoid the shift. 
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, shift); + } else { + return libdivide_u32_neon_srl(q, more); + } + } +} + +uint32x4_t libdivide_u32_branchfree_do_vec128( + uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, denom->more); +} + +////////// UINT64 + +uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u64_neon_srl(numers, more); + } else { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // No 64-bit halving subtracts in NEON :( + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, shift); + } else { + return libdivide_u64_neon_srl(q, more); + } + } +} + +uint64x2_t libdivide_u64_branchfree_do_vec128( + uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, denom->more); +} + +////////// SINT16 + +int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){ + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)} + +int16x8_t libdivide_s16_branchfree_do_vec128( + int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){ + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)} + +////////// SINT32 + +int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if 
(!denom->magic) { + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak)); + q = libdivide_s32_neon_sra(q, shift); + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = vsubq_s32(veorq_s32(q, sign), sign); + return q; + } else { + int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign)); + } + // q >>= shift + q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = vaddq_s32( + q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31))); // q += (q < 0) + return q; + } +} + +int32x4_t libdivide_s32_branchfree_do_vec128( + int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic); + q = vaddq_s32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31 + int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2); + q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s32_neon_sra(q, shift); // q >>= shift + q = vsubq_s32(veorq_s32(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +int64x2_t 
libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend + // q = numer + ((numer >> 63) & roundToZeroTweak); + int64x2_t q = + vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_neon_sra(q, shift); + // q = (q ^ sign) - sign; + int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7)); + q = vsubq_s64(veorq_s64(q, sign), sign); + return q; + } else { + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: no need to widen + // q += ((numer ^ sign) - sign); + q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = vaddq_s64( + q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63))); // q += (q < 0) + return q; + } +} + +int64x2_t libdivide_s64_branchfree_do_vec128( + int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: avoid sign extend + + // libdivide_mullhi_s64(numers, magic); + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + q = vaddq_s64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
+ uint32_t is_power_of_2 = (magic == 0); + int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2); + q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_neon_sra(q, shift); // q >>= shift + q = vsubq_s64(veorq_s64(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +#if defined(LIBDIVIDE_AVX512) + +static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512( + __m512i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512( + __m512i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512( + __m512i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512( + __m512i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512( + __m512i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512( + __m512i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512( + __m512i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512( + __m512i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +static 
LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) { + ; return _mm512_srai_epi64(v, 63); } -static inline __m512i libdivide_s64_shift_right_vector(__m512i v, int amt) { +static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { return _mm512_srai_epi64(v, amt); } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1270,7 +2043,7 @@ static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) { } // b is one 32-bit value repeated. -static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1279,164 +2052,182 @@ static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) { } // Here, y is assumed to contain one 64-bit value repeated. 
-// https://stackoverflow.com/a/28827013 -static inline __m512i libdivide_mullhi_u64_vector(__m512i x, __m512i y) { - __m512i lomask = _mm512_set1_epi64(0xffffffff); - __m512i xh = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM) 0xB1); - __m512i yh = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM) 0xB1); - __m512i w0 = _mm512_mul_epu32(x, y); - __m512i w1 = _mm512_mul_epu32(x, yh); - __m512i w2 = _mm512_mul_epu32(xh, y); - __m512i w3 = _mm512_mul_epu32(xh, yh); - __m512i w0h = _mm512_srli_epi64(w0, 32); - __m512i s1 = _mm512_add_epi64(w1, w0h); - __m512i s1l = _mm512_and_si512(s1, lomask); - __m512i s1h = _mm512_srli_epi64(s1, 32); - __m512i s2 = _mm512_add_epi64(w2, s1l); - __m512i s2h = _mm512_srli_epi64(s2, 32); - __m512i hi = _mm512_add_epi64(w3, s1h); - hi = _mm512_add_epi64(hi, s2h); +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { + // see m128i variant for comments. + __m512i x0y0 = _mm512_mul_epu32(x, y); + __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32); - return hi; + __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + + __m512i x0y1 = _mm512_mul_epu32(x, y1); + __m512i x1y0 = _mm512_mul_epu32(x1, y); + __m512i x1y1 = _mm512_mul_epu32(x1, y1); + + __m512i mask = _mm512_set1_epi64(0xFFFFFFFF); + __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi); + __m512i temp_lo = _mm512_and_si512(temp, mask); + __m512i temp_hi = _mm512_srli_epi64(temp, 32); + + temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm512_add_epi64(x1y1, temp_hi); + return _mm512_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. 
-static inline __m512i libdivide_mullhi_s64_vector(__m512i x, __m512i y) { - __m512i p = libdivide_mullhi_u64_vector(x, y); - __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y); - __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x); +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { + __m512i p = libdivide_mullhi_u64_vec512(x, y); + __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y); + __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x); p = _mm512_sub_epi64(p, t1); p = _mm512_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){ + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)} + +__m512i libdivide_u16_branchfree_do_vec512( + __m512i numers, const struct libdivide_u16_branchfree_t *denom){ + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)} + ////////// UINT32 -__m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom) { +__m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm512_srli_epi32(numers, more); - } - else { - __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); + } else { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); return _mm512_srli_epi32(t, shift); - } - else { + } else { return _mm512_srli_epi32(q, more); } } } -__m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom) { - __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); +__m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, 
const struct libdivide_u32_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); return _mm512_srli_epi32(t, denom->more); } ////////// UINT64 -__m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom) { +__m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm512_srli_epi64(numers, more); - } - else { - __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); + } else { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); return _mm512_srli_epi64(t, shift); - } - else { + } else { return _mm512_srli_epi64(q, more); } } } -__m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom) { - __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); +__m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); return _mm512_srli_epi64(t, denom->more); } +////////// SINT16 + +__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){ + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)} + +__m512i libdivide_s16_branchfree_do_vec512( + __m512i numers, const struct libdivide_s16_branchfree_t *denom){ + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)} + ////////// SINT32 -__m512i libdivide_s32_do_vector(__m512i numers, const 
struct libdivide_s32_t *denom) { +__m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); - __m512i q = _mm512_add_epi32(numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); + __m512i q = _mm512_add_epi32( + numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm512_srai_epi32(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); return q; - } - else { - __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(denom->magic)); + } else { + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { - // must be arithmetic shift + // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - // q += ((numer ^ sign) - sign); + // q += ((numer ^ sign) - sign); q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign)); } // q >>= shift q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); - q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) + q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) return q; } } -__m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom) { +__m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift + // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - __m512i q = 
libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(magic)); - q = _mm512_add_epi32(q, numers); // q += numers + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic)); + q = _mm512_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); - __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 - __m512i mask = _mm512_set1_epi32((1U << shift) - is_power_of_2); - q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm512_srai_epi32(q, shift); // q >>= shift - q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 + __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm512_srai_epi32(q, shift); // q >>= shift + q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 -__m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom) { +__m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; - if (magic == 0) { // shift path + if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi64(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m512i q = _mm512_add_epi64(numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shift); + __m512i q = _mm512_add_epi64( + numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), 
roundToZeroTweak)); + q = libdivide_s64_shift_right_vec512(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - // q = (q ^ sign) - sign; + // q = (q ^ sign) - sign; q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); return q; - } - else { - __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); + } else { + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); @@ -1444,67 +2235,86 @@ __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *de q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign)); } // q >>= denom->mult_path.shift - q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); - q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) + q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) return q; } } -__m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom) { +__m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - // libdivide_mullhi_s64(numers, magic); - __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); - q = _mm512_add_epi64(q, numers); // q += numers + // libdivide_mullhi_s64(numers, magic); + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); + q = _mm512_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); - __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m512i mask = _mm512_set1_epi64((1ULL << shift) - is_power_of_2); - q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) - q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift - q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + __m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63 + __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2); + q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift + q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign return q; } -#elif defined(LIBDIVIDE_AVX2) +#endif -static inline __m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom); -static inline __m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom); -static inline __m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom); -static inline __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom); +#if defined(LIBDIVIDE_AVX2) -static inline __m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256( + __m256i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256( + __m256i numers, const struct libdivide_s16_t *denom); 
+static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256( + __m256i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256( + __m256i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256( + __m256i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256( + __m256i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256( + __m256i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256( + __m256i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm256_srai_epi64(v, 63) (from AVX512). -static inline __m256i libdivide_s64_signbits(__m256i v) { +static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) { __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm256_srai_epi64 (from AVX512). 
-static inline __m256i libdivide_s64_shift_right_vector(__m256i v, int amt) { +static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { const int b = 64 - amt; - __m256i m = _mm256_set1_epi64x(1ULL << (b - 1)); + __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1)); __m256i x = _mm256_srli_epi64(v, amt); __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1513,7 +2323,7 @@ static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) { } // b is one 32-bit value repeated. -static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1522,164 +2332,241 @@ static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) { } // Here, y is assumed to contain one 64-bit value repeated. 
-// https://stackoverflow.com/a/28827013 -static inline __m256i libdivide_mullhi_u64_vector(__m256i x, __m256i y) { - __m256i lomask = _mm256_set1_epi64x(0xffffffff); - __m256i xh = _mm256_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h - __m256i yh = _mm256_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h - __m256i w0 = _mm256_mul_epu32(x, y); // x0l*y0l, x1l*y1l - __m256i w1 = _mm256_mul_epu32(x, yh); // x0l*y0h, x1l*y1h - __m256i w2 = _mm256_mul_epu32(xh, y); // x0h*y0l, x1h*y0l - __m256i w3 = _mm256_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h - __m256i w0h = _mm256_srli_epi64(w0, 32); - __m256i s1 = _mm256_add_epi64(w1, w0h); - __m256i s1l = _mm256_and_si256(s1, lomask); - __m256i s1h = _mm256_srli_epi64(s1, 32); - __m256i s2 = _mm256_add_epi64(w2, s1l); - __m256i s2h = _mm256_srli_epi64(s2, 32); - __m256i hi = _mm256_add_epi64(w3, s1h); - hi = _mm256_add_epi64(hi, s2h); +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { + // see m128i variant for comments. + __m256i x0y0 = _mm256_mul_epu32(x, y); + __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32); - return hi; + __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + __m256i x0y1 = _mm256_mul_epu32(x, y1); + __m256i x1y0 = _mm256_mul_epu32(x1, y); + __m256i x1y1 = _mm256_mul_epu32(x1, y1); + + __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF); + __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi); + __m256i temp_lo = _mm256_and_si256(temp, mask); + __m256i temp_hi = _mm256_srli_epi64(temp, 32); + + temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm256_add_epi64(x1y1, temp_hi); + return _mm256_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. 
-static inline __m256i libdivide_mullhi_s64_vector(__m256i x, __m256i y) { - __m256i p = libdivide_mullhi_u64_vector(x, y); - __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y); - __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x); +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { + __m256i p = libdivide_mullhi_u64_vec256(x, y); + __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y); + __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x); p = _mm256_sub_epi64(p, t1); p = _mm256_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm256_srli_epi16(numers, more); + } else { + __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); + return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); + } else { + return _mm256_srli_epi16(q, more); + } + } +} + +__m256i libdivide_u16_branchfree_do_vec256( + __m256i numers, const struct libdivide_u16_branchfree_t *denom) { + __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); + __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); + return _mm256_srli_epi16(t, denom->more); +} + ////////// UINT32 -__m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom) { +__m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi32(numers, more); - } - else { - __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); + } else { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) 
>> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, shift); - } - else { + } else { return _mm256_srli_epi32(q, more); } } } -__m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom) { - __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); +__m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, denom->more); } ////////// UINT64 -__m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom) { +__m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi64(numers, more); - } - else { - __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); + } else { + __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, shift); - } - else { + } else { return _mm256_srli_epi64(q, more); } } } -__m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom) { - __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); +__m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u64_vec256(numers, 
_mm256_set1_epi64x(denom->magic)); __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, denom->more); } +////////// SINT16 + +__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + uint16_t mask = ((uint16_t)1 << shift) - 1; + __m256i roundToZeroTweak = _mm256_set1_epi16(mask); + // q = numer + ((numer >> 15) & roundToZeroTweak); + __m256i q = _mm256_add_epi16( + numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak)); + q = _mm256_srai_epi16(q, shift); + __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); + return q; + } else { + __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign)); + } + // q >>= shift + q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); + q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0) + return q; + } +} + +__m256i libdivide_s16_branchfree_do_vec256( + __m256i numers, const struct libdivide_s16_branchfree_t *denom) { + int16_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift + __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); + __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic)); + q = _mm256_add_epi16(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + __m256i q_sign = 
_mm256_srai_epi16(q, 15); // q_sign = q >> 15 + __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm256_srai_epi16(q, shift); // q >>= shift + q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + ////////// SINT32 -__m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom) { +__m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); - __m256i q = _mm256_add_epi32(numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); + __m256i q = _mm256_add_epi32( + numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm256_srai_epi32(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); return q; - } - else { - __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(denom->magic)); + } else { + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { - // must be arithmetic shift + // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - // q += ((numer ^ sign) - sign); + // q += ((numer ^ sign) - sign); q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign)); } // q >>= shift q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); - q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) + q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) return q; } } -__m256i libdivide_s32_branchfree_do_vector(__m256i 
numers, const struct libdivide_s32_branchfree_t *denom) { +__m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift + // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(magic)); - q = _mm256_add_epi32(q, numers); // q += numers + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic)); + q = _mm256_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); - __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 - __m256i mask = _mm256_set1_epi32((1U << shift) - is_power_of_2); - q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm256_srai_epi32(q, shift); // q >>= shift - q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 + __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm256_srai_epi32(q, shift); // q >>= shift + q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 -__m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom) { +__m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; - if (magic == 0) { // shift path + if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = 
((uint64_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m256i q = _mm256_add_epi64(numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shift); + __m256i q = _mm256_add_epi64( + numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec256(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - // q = (q ^ sign) - sign; + // q = (q ^ sign) - sign; q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); return q; - } - else { - __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); + } else { + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); @@ -1687,67 +2574,86 @@ __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *de q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign)); } // q >>= denom->mult_path.shift - q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); - q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) + q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) return q; } } -__m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom) { +__m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - // libdivide_mullhi_s64(numers, magic); - __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); - q = 
_mm256_add_epi64(q, numers); // q += numers + // libdivide_mullhi_s64(numers, magic); + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); + q = _mm256_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. uint32_t is_power_of_2 = (magic == 0); - __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m256i mask = _mm256_set1_epi64x((1ULL << shift) - is_power_of_2); - q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) - q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift - q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63 + __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift + q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } -#elif defined(LIBDIVIDE_SSE2) +#endif -static inline __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); -static inline __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); -static inline __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); -static inline __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); +#if defined(LIBDIVIDE_SSE2) -static inline __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct 
libdivide_u64_branchfree_t *denom); -static inline __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128( + __m128i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128( + __m128i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128( + __m128i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128( + __m128i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128( + __m128i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128( + __m128i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128( + __m128i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128( + __m128i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm_srai_epi64(v, 63) (from AVX512). 
-static inline __m128i libdivide_s64_signbits(__m128i v) { +static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) { __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm_srai_epi64 (from AVX512). -static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { +static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { const int b = 64 - amt; - __m128i m = _mm_set1_epi64x(1ULL << (b - 1)); + __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1)); __m128i x = _mm_srli_epi64(v, amt); __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); __m128i a1X3X = _mm_srli_epi64(a, 32); __m128i mask = _mm_set_epi32(-1, 0, -1, 0); @@ -1758,8 +2664,8 @@ static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) { // SSE2 does not have a signed multiplication instruction, but we can convert // unsigned to signed pretty efficiently. Again, b is just a 32 bit value // repeated four times. -static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) { - __m128i p = libdivide_mullhi_u32_vector(a, b); +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { + __m128i p = libdivide_mullhi_u32_vec128(a, b); // t1 = (a >> 31) & y, arithmetic shift __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); @@ -1769,164 +2675,251 @@ static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) { } // Here, y is assumed to contain one 64-bit value repeated. 
-// https://stackoverflow.com/a/28827013 -static inline __m128i libdivide_mullhi_u64_vector(__m128i x, __m128i y) { - __m128i lomask = _mm_set1_epi64x(0xffffffff); - __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h - __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h - __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, x1l*y1l - __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h - __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y0l - __m128i w3 = _mm_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h - __m128i w0h = _mm_srli_epi64(w0, 32); - __m128i s1 = _mm_add_epi64(w1, w0h); - __m128i s1l = _mm_and_si128(s1, lomask); - __m128i s1h = _mm_srli_epi64(s1, 32); - __m128i s2 = _mm_add_epi64(w2, s1l); - __m128i s2h = _mm_srli_epi64(s2, 32); - __m128i hi = _mm_add_epi64(w3, s1h); - hi = _mm_add_epi64(hi, s2h); +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { + // full 128 bits product is: + // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) + // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. - return hi; + // Compute x0*y0. + // Note x1, y1 are ignored by mul_epu32. + __m128i x0y0 = _mm_mul_epu32(x, y); + __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32); + + // Get x1, y1 in the low bits. + // We could shuffle or right shift. Shuffles are preferred as they preserve + // the source register for the next computation. + __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + // No need to mask off top 32 bits for mul_epu32. + __m128i x0y1 = _mm_mul_epu32(x, y1); + __m128i x1y0 = _mm_mul_epu32(x1, y); + __m128i x1y1 = _mm_mul_epu32(x1, y1); + + // Mask here selects low bits only. 
+ __m128i mask = _mm_set1_epi64x(0xFFFFFFFF); + __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); + __m128i temp_lo = _mm_and_si128(temp, mask); + __m128i temp_hi = _mm_srli_epi64(temp, 32); + + temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm_add_epi64(x1y1, temp_hi); + return _mm_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. -static inline __m128i libdivide_mullhi_s64_vector(__m128i x, __m128i y) { - __m128i p = libdivide_mullhi_u64_vector(x, y); - __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); - __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_vec128(x, y); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x); p = _mm_sub_epi64(p, t1); p = _mm_sub_epi64(p, t2); return p; } +////////// UINT26 + +__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm_srli_epi16(numers, more); + } else { + __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); + return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); + } else { + return _mm_srli_epi16(q, more); + } + } +} + +__m128i libdivide_u16_branchfree_do_vec128( + __m128i numers, const struct libdivide_u16_branchfree_t *denom) { + __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); + __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); + return _mm_srli_epi16(t, denom->more); +} + ////////// UINT32 -__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { +__m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if 
(!denom->magic) { return _mm_srli_epi32(numers, more); - } - else { - __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); + } else { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, shift); - } - else { + } else { return _mm_srli_epi32(q, more); } } } -__m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) { - __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); +__m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, denom->more); } ////////// UINT64 -__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) { +__m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm_srli_epi64(numers, more); - } - else { - __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); + } else { + __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, shift); - } - else { + } else { return _mm_srli_epi64(q, more); } } } -__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) { - __m128i q = 
libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); +__m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, denom->more); } +////////// SINT16 + +__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + uint16_t mask = ((uint16_t)1 << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi16(mask); + // q = numer + ((numer >> 15) & roundToZeroTweak); + __m128i q = + _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak)); + q = _mm_srai_epi16(q, shift); + __m128i sign = _mm_set1_epi16((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); + return q; + } else { + __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi16((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign)); + } + // q >>= shift + q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); + q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s16_branchfree_do_vec128( + __m128i numers, const struct libdivide_s16_branchfree_t *denom) { + int16_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi16((int8_t)more >> 7); + __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic)); + q = _mm_add_epi16(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either 
(2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + __m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15 + __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); + q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi16(q, shift); // q >>= shift + q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + ////////// SINT32 -__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) { +__m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); - __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + __m128i q = + _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm_srai_epi32(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); return q; - } - else { - __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(denom->magic)); + } else { + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { - // must be arithmetic shift + // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - // q += ((numer ^ sign) - sign); + // q += ((numer ^ sign) - sign); q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); } // q >>= shift q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) return q; } } 
-__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) { +__m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift + // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(magic)); - q = _mm_add_epi32(q, numers); // q += numers + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic)); + q = _mm_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 - __m128i mask = _mm_set1_epi32((1U << shift) - is_power_of_2); - q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm_srai_epi32(q, shift); // q >>= shift - q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 + __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi32(q, shift); // q >>= shift + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 -__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) { +__m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; - if (magic == 0) { // shift path + if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = 
((uint64_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shift); + __m128i q = _mm_add_epi64( + numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec128(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - // q = (q ^ sign) - sign; + // q = (q ^ sign) - sign; q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); return q; - } - else { - __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic)); + } else { + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); @@ -1934,32 +2927,33 @@ __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *de q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); } // q >>= denom->mult_path.shift - q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); - q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) return q; } } -__m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom) { +__m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - // libdivide_mullhi_s64(numers, magic); - __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic)); - q = _mm_add_epi64(q, numers); // q += numers + // 
libdivide_mullhi_s64(numers, magic); + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); + q = _mm_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m128i mask = _mm_set1_epi64x((1ULL << shift) - is_power_of_2); - q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) - q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift - q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63 + __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); + q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift + q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } @@ -1969,143 +2963,307 @@ __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivid #ifdef __cplusplus -// The C++ divider class is templated on both an integer type -// (like uint64_t) and an algorithm type. -// * BRANCHFULL is the default algorithm type. -// * BRANCHFREE is the branchfree algorithm type. -enum { - BRANCHFULL, - BRANCHFREE +enum Branching { + BRANCHFULL, // use branching algorithms + BRANCHFREE // use branchfree algorithms }; -#if defined(LIBDIVIDE_AVX512) - #define LIBDIVIDE_VECTOR_TYPE __m512i -#elif defined(LIBDIVIDE_AVX2) - #define LIBDIVIDE_VECTOR_TYPE __m256i -#elif defined(LIBDIVIDE_SSE2) - #define LIBDIVIDE_VECTOR_TYPE __m128i +namespace detail { +enum Signedness { + SIGNED, + UNSIGNED, +}; + +#if defined(LIBDIVIDE_NEON) +// Helper to deduce NEON vector type for integral type. 
+template +struct NeonVec {}; + +template <> +struct NeonVec<16, UNSIGNED> { + typedef uint16x8_t type; +}; + +template <> +struct NeonVec<16, SIGNED> { + typedef int16x8_t type; +}; + +template <> +struct NeonVec<32, UNSIGNED> { + typedef uint32x4_t type; +}; + +template <> +struct NeonVec<32, SIGNED> { + typedef int32x4_t type; +}; + +template <> +struct NeonVec<64, UNSIGNED> { + typedef uint64x2_t type; +}; + +template <> +struct NeonVec<64, SIGNED> { + typedef int64x2_t type; +}; + +template +struct NeonVecFor { + // See 'class divider' for an explanation of these template parameters. + typedef typename NeonVec> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type; +}; + +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ + LIBDIVIDE_INLINE typename NeonVecFor::type divide( \ + typename NeonVecFor::type n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) #endif -#if !defined(LIBDIVIDE_VECTOR_TYPE) - #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) +#if defined(LIBDIVIDE_SSE2) +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } #else - #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) \ - LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { \ - return libdivide_##ALGO##_do_vector(n, &denom); \ - } +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX2) +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { \ + return libdivide_##ALGO##_do_vec256(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX512) +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { \ + return libdivide_##ALGO##_do_vec512(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) #endif // The DISPATCHER_GEN() macro generates C++ methods (for the given integer // and algorithm types) that redirect to 
libdivide's C API. -#define DISPATCHER_GEN(T, ALGO) \ - libdivide_##ALGO##_t denom; \ - dispatcher() { } \ - dispatcher(T d) \ - : denom(libdivide_##ALGO##_gen(d)) \ - { } \ - T divide(T n) const { \ - return libdivide_##ALGO##_do(n, &denom); \ - } \ - LIBDIVIDE_DIVIDE_VECTOR(ALGO) \ - T recover() const { \ - return libdivide_##ALGO##_recover(&denom); \ - } +#define DISPATCHER_GEN(T, ALGO) \ + libdivide_##ALGO##_t denom; \ + LIBDIVIDE_INLINE dispatcher() {} \ + LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ + LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ + LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ + LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ + LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX512(ALGO) // The dispatcher selects a specific division algorithm for a given -// type and ALGO using partial template specialization. -template struct dispatcher { }; +// width, signedness, and ALGO using partial template specialization. 
+template +struct dispatcher {}; -template<> struct dispatcher { DISPATCHER_GEN(int32_t, s32) }; -template<> struct dispatcher { DISPATCHER_GEN(int32_t, s32_branchfree) }; -template<> struct dispatcher { DISPATCHER_GEN(uint32_t, u32) }; -template<> struct dispatcher { DISPATCHER_GEN(uint32_t, u32_branchfree) }; -template<> struct dispatcher { DISPATCHER_GEN(int64_t, s64) }; -template<> struct dispatcher { DISPATCHER_GEN(int64_t, s64_branchfree) }; -template<> struct dispatcher { DISPATCHER_GEN(uint64_t, u64) }; -template<> struct dispatcher { DISPATCHER_GEN(uint64_t, u64_branchfree) }; +template <> +struct dispatcher<16, SIGNED, BRANCHFULL> { + DISPATCHER_GEN(int16_t, s16) +}; +template <> +struct dispatcher<16, SIGNED, BRANCHFREE> { + DISPATCHER_GEN(int16_t, s16_branchfree) +}; +template <> +struct dispatcher<16, UNSIGNED, BRANCHFULL> { + DISPATCHER_GEN(uint16_t, u16) +}; +template <> +struct dispatcher<16, UNSIGNED, BRANCHFREE> { + DISPATCHER_GEN(uint16_t, u16_branchfree) +}; +template <> +struct dispatcher<32, SIGNED, BRANCHFULL> { + DISPATCHER_GEN(int32_t, s32) +}; +template <> +struct dispatcher<32, SIGNED, BRANCHFREE> { + DISPATCHER_GEN(int32_t, s32_branchfree) +}; +template <> +struct dispatcher<32, UNSIGNED, BRANCHFULL> { + DISPATCHER_GEN(uint32_t, u32) +}; +template <> +struct dispatcher<32, UNSIGNED, BRANCHFREE> { + DISPATCHER_GEN(uint32_t, u32_branchfree) +}; +template <> +struct dispatcher<64, SIGNED, BRANCHFULL> { + DISPATCHER_GEN(int64_t, s64) +}; +template <> +struct dispatcher<64, SIGNED, BRANCHFREE> { + DISPATCHER_GEN(int64_t, s64_branchfree) +}; +template <> +struct dispatcher<64, UNSIGNED, BRANCHFULL> { + DISPATCHER_GEN(uint64_t, u64) +}; +template <> +struct dispatcher<64, UNSIGNED, BRANCHFREE> { + DISPATCHER_GEN(uint64_t, u64_branchfree) +}; +} // namespace detail + +#if defined(LIBDIVIDE_NEON) +// Allow NeonVecFor outside of detail namespace. 
+template +struct NeonVecFor { + typedef typename detail::NeonVecFor::type type; +}; +#endif // This is the main divider class for use by the user (C++ API). // The actual division algorithm is selected using the dispatcher struct -// based on the integer and algorithm template parameters. -template +// based on the integer width and algorithm template parameters. +template class divider { -public: + private: + // Dispatch based on the size and signedness. + // We avoid using type_traits as it's not available in AVR. + // Detect signedness by checking if T(-1) is less than T(0). + // Also throw in a shift by 0, which prevents floating point types from being passed. + typedef detail::dispatcher> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO> + dispatcher_t; + + public: // We leave the default constructor empty so that creating // an array of dividers and then initializing them // later doesn't slow us down. - divider() { } + divider() {} // Constructor that takes the divisor as a parameter - divider(T d) : div(d) { } + LIBDIVIDE_INLINE divider(T d) : div(d) {} // Divides n by the divisor - T divide(T n) const { - return div.divide(n); - } + LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); } // Recovers the divisor, returns the value that was // used to initialize this divider object. - T recover() const { - return div.recover(); + T recover() const { return div.recover(); } + + bool operator==(const divider &other) const { + return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more; } - bool operator==(const divider& other) const { - return div.denom.magic == other.denom.magic && - div.denom.more == other.denom.more; - } + bool operator!=(const divider &other) const { return !(*this == other); } - bool operator!=(const divider& other) const { - return !(*this == other); - } - -#if defined(LIBDIVIDE_VECTOR_TYPE) - // Treats the vector as packed integer values with the same type as - // the divider (e.g. 
s32, u32, s64, u64) and divides each of - // them by the divider, returning the packed quotients. - LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { + // Vector variants treat the input as packed integer values with the same type as the divider + // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed + // quotients. +#if defined(LIBDIVIDE_SSE2) + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX2) + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX512) + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_NEON) + LIBDIVIDE_INLINE typename NeonVecFor::type divide(typename NeonVecFor::type n) const { return div.divide(n); } #endif -private: + private: // Storage for the actual divisor - dispatcher::value, - std::is_signed::value, sizeof(T), ALGO> div; + dispatcher_t div; }; // Overload of operator / for scalar division -template -T operator/(T n, const divider& div) { +template +LIBDIVIDE_INLINE T operator/(T n, const divider &div) { return div.divide(n); } // Overload of operator /= for scalar division -template -T& operator/=(T& n, const divider& div) { +template +LIBDIVIDE_INLINE T &operator/=(T &n, const divider &div) { n = div.divide(n); return n; } -#if defined(LIBDIVIDE_VECTOR_TYPE) - // Overload of operator / for vector division - template - LIBDIVIDE_VECTOR_TYPE operator/(LIBDIVIDE_VECTOR_TYPE n, const divider& div) { - return div.divide(n); - } - // Overload of operator /= for vector division - template - LIBDIVIDE_VECTOR_TYPE& operator/=(LIBDIVIDE_VECTOR_TYPE& n, const divider& div) { - n = div.divide(n); - return n; - } +// Overloads for vector types. 
+#if defined(LIBDIVIDE_SSE2) +template +LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider &div) { + return div.divide(n); +} + +template +LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX2) +template +LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider &div) { + return div.divide(n); +} + +template +LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX512) +template +LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider &div) { + return div.divide(n); +} + +template +LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider &div) { + n = div.divide(n); + return n; +} #endif -// libdivdie::branchfree_divider +#if defined(LIBDIVIDE_NEON) +template +LIBDIVIDE_INLINE typename NeonVecFor::type operator/( + typename NeonVecFor::type n, const divider &div) { + return div.divide(n); +} + +template +LIBDIVIDE_INLINE typename NeonVecFor::type operator/=( + typename NeonVecFor::type &n, const divider &div) { + n = div.divide(n); + return n; +} +#endif + +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) +// libdivide::branchfree_divider template using branchfree_divider = divider; +#endif -} // namespace libdivide +} // namespace libdivide -#endif // __cplusplus +#endif // __cplusplus -#endif // LIBDIVIDE_H +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#endif // LIBDIVIDE_H diff --git a/src/r_draw.c b/src/r_draw.cpp similarity index 92% rename from src/r_draw.c rename to src/r_draw.cpp index d07323752..24b3c539f 100644 --- a/src/r_draw.c +++ b/src/r_draw.cpp @@ -8,7 +8,7 @@ // terms of the GNU General Public License, version 2. // See the 'LICENSE' file for more details. 
//----------------------------------------------------------------------------- -/// \file r_draw.c +/// \file r_draw.cpp /// \brief span / column drawer functions, for 8bpp and 16bpp /// All drawing to the view buffer is accomplished in this file. /// The other refresh files only know about ccordinates, @@ -33,24 +33,24 @@ #include "hardware/hw_main.h" #endif +#include // -------------------------------------------- // assembly or c drawer routines for 8bpp/16bpp // -------------------------------------------- coldrawfunc_t *colfunc; + coldrawfunc_t *colfuncs[COLDRAWFUNC_MAX]; -#ifdef USE_COL_SPAN_ASM -coldrawfunc_t *colfuncs_asm[COLDRAWFUNC_MAX]; -#endif +coldrawfunc_t *colfuncs_bm[COLDRAWFUNC_MAX]; + int colfunctype; spandrawfunc_t *spanfunc; spandrawfunc_t *spanfuncs[SPANDRAWFUNC_MAX]; +spandrawfunc_t *spanfuncs_bm[SPANDRAWFUNC_MAX]; spandrawfunc_t *spanfuncs_npo2[SPANDRAWFUNC_MAX]; -#ifdef USE_COL_SPAN_ASM -spandrawfunc_t *spanfuncs_asm[SPANDRAWFUNC_MAX]; -#endif +spandrawfunc_t *spanfuncs_bm_npo2[SPANDRAWFUNC_MAX]; spandrawfunc_t *spanfuncs_flat[SPANDRAWFUNC_MAX]; drawcolumndata_t g_dc; @@ -212,17 +212,17 @@ static void R_AllocateBlendTables(void) { if (i == blendtab_modulate) continue; - blendtables[i] = Z_MallocAlign((NUMTRANSTABLES + 1) * 0x10000, PU_STATIC, NULL, 16); + blendtables[i] = static_cast(Z_MallocAlign((NUMTRANSTABLES + 1) * 0x10000, PU_STATIC, NULL, 16)); } // Modulation blending only requires a single table - blendtables[blendtab_modulate] = Z_MallocAlign(0x10000, PU_STATIC, NULL, 16); + blendtables[blendtab_modulate] = static_cast(Z_MallocAlign(0x10000, PU_STATIC, NULL, 16)); } #ifdef HAVE_THREADS static void R_GenerateBlendTables_Thread(void *userdata) { - struct GenerateBlendTables_State *state = userdata; + struct GenerateBlendTables_State *state = static_cast(userdata); R_GenerateBlendTables_Core(state); @@ -239,8 +239,7 @@ void R_InitTranslucencyTables(void) // Load here the transparency lookup tables 'TINTTAB' // NOTE: the TINTTAB 
resource MUST BE aligned on 64k for the asm // optimised code (in other words, transtables pointer low word is 0) - transtables = Z_MallocAlign(NUMTRANSTABLES*0x10000, PU_STATIC, - NULL, 16); + transtables = static_cast(Z_MallocAlign(NUMTRANSTABLES*0x10000, PU_STATIC, NULL, 16)); W_ReadLump(W_GetNumForName("TRANS10"), transtables); W_ReadLump(W_GetNumForName("TRANS20"), transtables+0x10000); @@ -260,11 +259,11 @@ void R_GenerateBlendTables(void) { #ifdef HAVE_THREADS // Allocate copies for the worker thread since the originals can be freed in the main thread. - struct GenerateBlendTables_State *state = malloc(sizeof *state); + struct GenerateBlendTables_State *state = static_cast(malloc(sizeof *state)); size_t palsize = 256 * sizeof(RGBA_t); - state->masterPalette = memcpy(malloc(palsize), pMasterPalette, palsize); - state->gammaCorrectedPalette = memcpy(malloc(palsize), pGammaCorrectedPalette, palsize); + state->masterPalette = static_cast(memcpy(malloc(palsize), pMasterPalette, palsize)); + state->gammaCorrectedPalette = static_cast(memcpy(malloc(palsize), pGammaCorrectedPalette, palsize)); I_spawn_thread("blend-tables", R_GenerateBlendTables_Thread, state); @@ -313,7 +312,7 @@ void R_GenerateTranslucencyTable(UINT8 *table, RGBA_t* sourcepal, int style, UIN } } -#define ClipTransLevel(trans) max(min((trans), NUMTRANSMAPS-2), 0) +#define ClipTransLevel(trans) std::clamp(trans, 0, NUMTRANSMAPS-2) UINT8 *R_GetTranslucencyTable(INT32 alphalevel) { @@ -364,7 +363,7 @@ UINT8* R_GetTranslationColormap(INT32 skinnum, skincolornum_t color, UINT8 flags { // Allocate table for skin if necessary if (!translationtablecache[skintableindex]) - translationtablecache[skintableindex] = Z_Calloc(MAXSKINCOLORS * sizeof(UINT8**), PU_STATIC, NULL); + translationtablecache[skintableindex] = static_cast(Z_Calloc(MAXSKINCOLORS * sizeof(UINT8**), PU_STATIC, NULL)); // Get colormap ret = translationtablecache[skintableindex][color]; @@ -383,7 +382,7 @@ UINT8* R_GetTranslationColormap(INT32 
skinnum, skincolornum_t color, UINT8 flags // Generate the colormap if necessary if (!ret) { - ret = Z_MallocAlign(NUM_PALETTE_ENTRIES, (flags & GTC_CACHE) ? PU_LEVEL : PU_STATIC, NULL, 8); + ret = static_cast(Z_MallocAlign(NUM_PALETTE_ENTRIES, (flags & GTC_CACHE) ? PU_LEVEL : PU_STATIC, NULL, 8)); K_GenerateKartColormap(ret, skinnum, color); //R_GenerateTranslationColormap(ret, skinnum, color); // SRB2kart // Cache the colormap if desired @@ -425,7 +424,7 @@ UINT16 R_GetColorByName(const char *name) UINT16 R_GetSuperColorByName(const char *name) { UINT16 i, color = SKINCOLOR_NONE; - char *realname = Z_Malloc(MAXCOLORNAME+1, PU_STATIC, NULL); + char *realname = static_cast(Z_Malloc(MAXCOLORNAME+1, PU_STATIC, NULL)); snprintf(realname, MAXCOLORNAME+1, "Super %s 1", name); for (i = 1; i < numskincolors; i++) if (!stricmp(skincolors[i].name, realname)) { @@ -655,17 +654,8 @@ void R_DrawViewBorder(void) #endif // ========================================================================== -// INCLUDE 8bpp DRAWING CODE HERE +// INCLUDE MAIN DRAWERS CODE HERE // ========================================================================== -#include "r_draw8.c" -#include "r_draw8_npo2.c" -#include "r_draw8_flat.c" - -// ========================================================================== -// INCLUDE 16bpp DRAWING CODE HERE -// ========================================================================== - -#ifdef HIGHCOLOR -#include "r_draw16.c" -#endif +#include "r_draw_column.cpp" +#include "r_draw_span.cpp" diff --git a/src/r_draw.h b/src/r_draw.h index 3cc1381e8..643cc6403 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -64,22 +64,20 @@ extern float zeroheight; extern lumpnum_t viewborderlump[8]; - - // --------------------------------------------- // color mode dependent drawer function pointers // --------------------------------------------- -#define USE_COL_SPAN_ASM 0 - #define BASEDRAWFUNC 0 +typedef void (coldrawfunc_t)(drawcolumndata_t*); +typedef void 
(spandrawfunc_t)(drawspandata_t*); + enum { COLDRAWFUNC_BASE = BASEDRAWFUNC, COLDRAWFUNC_FUZZY, COLDRAWFUNC_TRANS, - COLDRAWFUNC_SHADE, COLDRAWFUNC_SHADOWED, COLDRAWFUNC_TRANSTRANS, COLDRAWFUNC_TWOSMULTIPATCH, @@ -90,15 +88,11 @@ enum COLDRAWFUNC_MAX }; -typedef void (coldrawfunc_t)(drawcolumndata_t*); -typedef void (spandrawfunc_t)(drawspandata_t*); - -extern coldrawfunc_t *colfunc; -extern coldrawfunc_t *colfuncs[COLDRAWFUNC_MAX]; -#ifdef USE_COL_SPAN_ASM -extern coldrawfunc_t *colfuncs_asm[COLDRAWFUNC_MAX]; -#endif extern int colfunctype; +extern coldrawfunc_t *colfunc; + +extern coldrawfunc_t *colfuncs[COLDRAWFUNC_MAX]; +extern coldrawfunc_t *colfuncs_bm[COLDRAWFUNC_MAX]; enum { @@ -120,16 +114,17 @@ enum SPANDRAWFUNC_TILTEDWATER, SPANDRAWFUNC_FOG, + SPANDRAWFUNC_TILTEDFOG, SPANDRAWFUNC_MAX }; extern spandrawfunc_t *spanfunc; + extern spandrawfunc_t *spanfuncs[SPANDRAWFUNC_MAX]; +extern spandrawfunc_t *spanfuncs_bm[SPANDRAWFUNC_MAX]; extern spandrawfunc_t *spanfuncs_npo2[SPANDRAWFUNC_MAX]; -#ifdef USE_COL_SPAN_ASM -extern spandrawfunc_t *spanfuncs_asm[SPANDRAWFUNC_MAX]; -#endif +extern spandrawfunc_t *spanfuncs_bm_npo2[SPANDRAWFUNC_MAX]; extern spandrawfunc_t *spanfuncs_flat[SPANDRAWFUNC_MAX]; // ------------------------------------------------ @@ -202,90 +197,98 @@ void R_DrawViewBorder(void); // 8bpp DRAWING CODE // ----------------- -void R_DrawColumn_8(drawcolumndata_t* dc); -void R_DrawShadeColumn_8(drawcolumndata_t* dc); -void R_DrawTranslucentColumn_8(drawcolumndata_t* dc); -void R_DrawDropShadowColumn_8(drawcolumndata_t* dc); -void R_DrawTranslatedColumn_8(drawcolumndata_t* dc); -void R_DrawTranslatedTranslucentColumn_8(drawcolumndata_t* dc); -void R_Draw2sMultiPatchColumn_8(drawcolumndata_t* dc); -void R_Draw2sMultiPatchTranslucentColumn_8(drawcolumndata_t* dc); -void R_DrawFogColumn_8(drawcolumndata_t* dc); -void R_DrawColumnShadowed_8(drawcolumndata_t* dc); +void R_DrawColumn(drawcolumndata_t* dc); +void R_DrawTranslucentColumn(drawcolumndata_t* dc); 
+void R_DrawDropShadowColumn(drawcolumndata_t* dc); +void R_DrawTranslatedColumn(drawcolumndata_t* dc); +void R_DrawTranslatedTranslucentColumn(drawcolumndata_t* dc); +void R_Draw2sMultiPatchColumn(drawcolumndata_t* dc); +void R_Draw2sMultiPatchTranslucentColumn(drawcolumndata_t* dc); +void R_DrawFogColumn(drawcolumndata_t* dc); +void R_DrawColumnShadowed(drawcolumndata_t* dc); -#define PLANELIGHTFLOAT (BASEVIDWIDTH * BASEVIDWIDTH / vid.width / ds->zeroheight / 21.0f * FIXED_TO_FLOAT(fovtan[viewssnum])) +void R_DrawColumn_Brightmap(drawcolumndata_t* dc); +void R_DrawTranslucentColumn_Brightmap(drawcolumndata_t* dc); +void R_DrawTranslatedColumn_Brightmap(drawcolumndata_t* dc); +void R_DrawTranslatedTranslucentColumn_Brightmap(drawcolumndata_t* dc); +void R_Draw2sMultiPatchColumn_Brightmap(drawcolumndata_t* dc); +void R_Draw2sMultiPatchTranslucentColumn_Brightmap(drawcolumndata_t* dc); +void R_DrawColumnShadowed_Brightmap(drawcolumndata_t* dc); -void R_DrawSpan_8(drawspandata_t* ds); -void R_DrawTranslucentSpan_8(drawspandata_t* ds); -void R_DrawTiltedSpan_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentSpan_8(drawspandata_t* ds); +void R_DrawSpan(drawspandata_t* ds); +void R_DrawTranslucentSpan(drawspandata_t* ds); +void R_DrawSplat(drawspandata_t* ds); +void R_DrawTranslucentSplat(drawspandata_t* ds); +void R_DrawFloorSprite(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan(drawspandata_t* ds); +void R_DrawFogSpan(drawspandata_t* ds); -void R_DrawSplat_8(drawspandata_t* ds); -void R_DrawTranslucentSplat_8(drawspandata_t* ds); -void R_DrawTiltedSplat_8(drawspandata_t* ds); +void R_DrawSpan_Tilted(drawspandata_t* ds); +void R_DrawTranslucentSpan_Tilted(drawspandata_t* ds); +void R_DrawSplat_Tilted(drawspandata_t* ds); +void R_DrawTranslucentSplat_Tilted(drawspandata_t* ds); +void R_DrawFloorSprite_Tilted(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Tilted(drawspandata_t* ds); +void 
R_DrawTranslucentWaterSpan_Tilted(drawspandata_t* ds); +void R_DrawFogSpan_Tilted(drawspandata_t* ds); -void R_DrawFloorSprite_8(drawspandata_t* ds); -void R_DrawTranslucentFloorSprite_8(drawspandata_t* ds); -void R_DrawTiltedFloorSprite_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentFloorSprite_8(drawspandata_t* ds); +void R_DrawSpan_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSpan_NPO2(drawspandata_t* ds); +void R_DrawSplat_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSplat_NPO2(drawspandata_t* ds); +void R_DrawFloorSprite_NPO2(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_NPO2(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_NPO2(drawspandata_t* ds); -void R_CalcTiltedLighting(INT32 *lightbuffer, INT32 x1, INT32 x2, fixed_t start, fixed_t end); +void R_DrawSpan_Tilted_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSpan_Tilted_NPO2(drawspandata_t* ds); +void R_DrawSplat_Tilted_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSplat_Tilted_NPO2(drawspandata_t* ds); +void R_DrawFloorSprite_Tilted_NPO2(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Tilted_NPO2(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Tilted_NPO2(drawspandata_t* ds); -void R_DrawTranslucentWaterSpan_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentWaterSpan_8(drawspandata_t* ds); +void R_DrawSpan_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentSpan_Brightmap(drawspandata_t* ds); +void R_DrawSplat_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentSplat_Brightmap(drawspandata_t* ds); +void R_DrawFloorSprite_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Brightmap(drawspandata_t* ds); -void R_DrawFogSpan_8(drawspandata_t* ds); +void R_DrawSpan_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentSpan_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawSplat_Tilted_Brightmap(drawspandata_t* ds); +void 
R_DrawTranslucentSplat_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawFloorSprite_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Tilted_Brightmap(drawspandata_t* ds); -// Lactozilla: Non-powers-of-two -void R_DrawSpan_NPO2_8(drawspandata_t* ds); -void R_DrawTranslucentSpan_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedSpan_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentSpan_NPO2_8(drawspandata_t* ds); +void R_DrawSpan_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSpan_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawSplat_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSplat_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawFloorSprite_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Brightmap_NPO2(drawspandata_t* ds); -void R_DrawSplat_NPO2_8(drawspandata_t* ds); -void R_DrawTranslucentSplat_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedSplat_NPO2_8(drawspandata_t* ds); - -void R_DrawFloorSprite_NPO2_8(drawspandata_t* ds); -void R_DrawTranslucentFloorSprite_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedFloorSprite_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentFloorSprite_NPO2_8(drawspandata_t* ds); - -void R_DrawTranslucentWaterSpan_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentWaterSpan_NPO2_8(drawspandata_t* ds); +void R_DrawSpan_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSpan_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawSplat_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSplat_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawFloorSprite_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Tilted_Brightmap_NPO2(drawspandata_t* ds); // Debugging - highlight surfaces 
in flat colors -void R_DrawColumn_Flat_8(drawcolumndata_t* dc); -void R_DrawSpan_Flat_8(drawspandata_t* ds); -void R_DrawTiltedSpan_Flat_8(drawspandata_t* ds); - -#ifdef USEASM -void ASMCALL R_DrawColumn_8_ASM(void); -void ASMCALL R_DrawShadeColumn_8_ASM(void); -void ASMCALL R_DrawTranslucentColumn_8_ASM(void); -void ASMCALL R_Draw2sMultiPatchColumn_8_ASM(void); - -void ASMCALL R_DrawColumn_8_MMX(void); - -void ASMCALL R_Draw2sMultiPatchColumn_8_MMX(void); -void ASMCALL R_DrawSpan_8_MMX(void); -#endif - -// ------------------ -// 16bpp DRAWING CODE -// ------------------ - -#ifdef HIGHCOLOR -void R_DrawColumn_16(void); -void R_DrawWallColumn_16(void); -void R_DrawTranslucentColumn_16(void); -void R_DrawTranslatedColumn_16(void); -void R_DrawSpan_16(void); -#endif +void R_DrawColumn_Flat(drawcolumndata_t* dc); +void R_DrawSpan_Flat(drawspandata_t* ds); +void R_DrawTiltedSpan_Flat(drawspandata_t* ds); #ifdef __cplusplus } // extern "C" -#endif +#endif // __cplusplus // ========================================================================= #endif // __R_DRAW__ diff --git a/src/r_draw16.c b/src/r_draw16.c deleted file mode 100644 index 8b1d29e8d..000000000 --- a/src/r_draw16.c +++ /dev/null @@ -1,214 +0,0 @@ -// SONIC ROBO BLAST 2 -//----------------------------------------------------------------------------- -// Copyright (C) 1998-2000 by DooM Legacy Team. -// Copyright (C) 1999-2020 by Sonic Team Junior. -// -// This program is free software distributed under the -// terms of the GNU General Public License, version 2. -// See the 'LICENSE' file for more details. 
-//----------------------------------------------------------------------------- -/// \file r_draw16.c -/// \brief 16bpp (HIGHCOLOR) span/column drawer functions -/// \note no includes because this is included as part of r_draw.c - -// ========================================================================== -// COLUMNS -// ========================================================================== - -/// \brief kick out the upper bit of each component (we're in 5 : 5 : 5) -#define HIMASK1 0x7bde - -/** \brief The R_DrawColumn_16 function - standard upto 128high posts column drawer -*/ -void R_DrawColumn_16(void) -{ - INT32 count; - INT16 *dest; - fixed_t frac, fracstep; - - count = dc_yh - dc_yl + 1; - - // Zero length, column does not exceed a pixel. - if (count <= 0) - return; - -#ifdef RANGECHECK - if (dc_x >= vid.width || dc_yl < 0 || dc_yh >= vid.height) - I_Error("R_DrawColumn_16: %d to %d at %d", dc_yl, dc_yh, dc_x); -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - dest = (INT16 *)(void *)(ylookup[dc_yl] + columnofs[dc_x]); - - // Determine scaling, which is the only mapping to be done. - fracstep = dc_iscale; - frac = dc_texturemid + (dc_yl - centery)*fracstep; - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. - - do - { - // Re-map color indices from wall texture column using a lighting/special effects LUT. - *dest = hicolormaps[((INT16 *)(void *)dc_source)[(frac>>FRACBITS)&127]>>1]; - - dest += vid.width; - frac += fracstep; - } while (--count); -} - -/** \brief The R_DrawWallColumn_16 function - LAME cutnpaste: same as R_DrawColumn_16 but wraps around 256 - instead of 128 for the tall sky textures (256x240) -*/ -void R_DrawWallColumn_16(void) -{ - INT32 count; - INT16 *dest; - fixed_t frac, fracstep; - - count = dc_yh - dc_yl + 1; - - // Zero length, column does not exceed a pixel. 
- if (count <= 0) - return; - -#ifdef RANGECHECK - if (dc_x >= vid.width || dc_yl < 0 || dc_yh >= vid.height) - I_Error("R_DrawWallColumn_16: %d to %d at %d", dc_yl, dc_yh, dc_x); -#endif - - dest = (INT16 *)(void *)(ylookup[dc_yl] + columnofs[dc_x]); - - fracstep = dc_iscale; - frac = dc_texturemid + (dc_yl - centery)*fracstep; - - do - { - *dest = hicolormaps[((INT16 *)(void *)dc_source)[(frac>>FRACBITS)&255]>>1]; - - dest += vid.width; - frac += fracstep; - } while (--count); -} - -/** \brief The R_DrawTranslucentColumn_16 function - LAME cutnpaste: same as R_DrawColumn_16 but does - translucent -*/ -void R_DrawTranslucentColumn_16(void) -{ - INT32 count; - INT16 *dest; - fixed_t frac, fracstep; - - // check out coords for src* - if ((dc_yl < 0) || (dc_x >= vid.width)) - return; - - count = dc_yh - dc_yl; - if (count < 0) - return; - -#ifdef RANGECHECK - if (dc_x >= vid.width || dc_yl < 0 || dc_yh >= vid.height) - I_Error("R_DrawTranslucentColumn_16: %d to %d at %d", dc_yl, dc_yh, dc_x); -#endif - - // FIXME. As above. - dest = (INT16 *)(void *)(ylookup[dc_yl] + columnofs[dc_x]); - - // Looks familiar. - fracstep = dc_iscale; - frac = dc_texturemid + (dc_yl - centery)*fracstep; - - // Here we do an additional index re-mapping. - do - { - *dest = (INT16)((INT16)((color8to16[dc_source[frac>>FRACBITS]]>>1) & 0x39ce) - + (INT16)(((*dest & HIMASK1)) & 0x7fff)); - - dest += vid.width; - frac += fracstep; - } while (count--); -} - -/** \brief The R_DrawTranslatedColumn_16 function - ? -*/ -void R_DrawTranslatedColumn_16(void) -{ - INT32 count; - INT16 *dest; - fixed_t frac, fracstep; - - count = dc_yh - dc_yl; - if (count < 0) - return; - -#ifdef RANGECHECK - if (dc_x >= vid.width || dc_yl < 0 || dc_yh >= vid.height) - I_Error("R_DrawTranslatedColumn_16: %d to %d at %d", dc_yl, dc_yh, dc_x); -#endif - - dest = (INT16 *)(void *)(ylookup[dc_yl] + columnofs[dc_x]); - - // Looks familiar. 
- fracstep = dc_iscale; - frac = dc_texturemid + (dc_yl - centery)*fracstep; - - // Here we do an additional index re-mapping. - do - { - *dest = color8to16[dc_colormap[dc_translation[dc_source[frac>>FRACBITS]]]]; - dest += vid.width; - - frac += fracstep; - } while (count--); -} - -// ========================================================================== -// SPANS -// ========================================================================== - -/** \brief The R_*_16 function - Draws the actual span. -*/ -void R_DrawSpan_16(void) -{ - fixed_t xfrac, yfrac; - INT16 *dest; - INT32 count, spot; - -#ifdef RANGECHECK - if (ds_x2 < ds_x1 || ds_x1 < 0 || ds_x2 >= vid.width || ds_y > vid.height) - I_Error("R_DrawSpan_16: %d to %d at %d", ds_x1, ds_x2, ds_y); -#endif - - xfrac = ds_xfrac; - yfrac = ds_yfrac; - - dest = (INT16 *)(void *)(ylookup[ds_y] + columnofs[ds_x1]); - - // We do not check for zero spans here? - count = ds_x2 - ds_x1; - - if (count <= 0) // We do now! - return; - - do - { - // Current texture index in u, v. - spot = ((yfrac>>(16-6))&(63*64)) + ((xfrac>>16)&63); - - // Lookup pixel from flat texture tile, re-index using light/colormap. - *dest++ = hicolormaps[((INT16 *)(void *)ds_source)[spot]>>1]; - - // Next step in u, v. - xfrac += ds_xstep; - yfrac += ds_ystep; - } while (count--); -} diff --git a/src/r_draw8.c b/src/r_draw8.c deleted file mode 100644 index 8840106e9..000000000 --- a/src/r_draw8.c +++ /dev/null @@ -1,2564 +0,0 @@ -// SONIC ROBO BLAST 2 -//----------------------------------------------------------------------------- -// Copyright (C) 1998-2000 by DooM Legacy Team. -// Copyright (C) 1999-2021 by Sonic Team Junior. -// -// This program is free software distributed under the -// terms of the GNU General Public License, version 2. -// See the 'LICENSE' file for more details. 
-//----------------------------------------------------------------------------- -/// \file r_draw8.c -/// \brief 8bpp span/column drawer functions -/// \note no includes because this is included as part of r_draw.c - -#include - -// ========================================================================== -// COLUMNS -// ========================================================================== - -// A column is a vertical slice/span of a wall texture that uses -// a has a constant z depth from top to bottom. -// - -/** \brief The R_DrawColumn_8 function - Experiment to make software go faster. Taken from the Boom source -*/ -void R_DrawColumn_8(drawcolumndata_t* dc) -{ - INT32 count; - register UINT8 *dest; - register fixed_t frac; - fixed_t fracstep; - - count = dc->yh - dc->yl; - - if (count < 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - return; -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - count++; - - // Determine scaling, which is the only mapping to be done. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. 
- { - register const UINT8 *source = dc->source; - register const UINT8 *brightmap = dc->brightmap; - register const lighttable_t *colormap = dc->colormap; - register const lighttable_t *fullbright = dc->fullbright; - register INT32 heightmask = dc->texheight-1; - if (dc->texheight & heightmask) // not a power of 2 -- killough - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0); - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - if (brightmap != NULL && brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = fullbright[source[frac>>FRACBITS]]; - } - else - { - *dest = colormap[source[frac>>FRACBITS]]; - } - dest += vid.width; - - // Avoid overflow. - if (fracstep > 0x7FFFFFFF - frac) - frac += fracstep - heightmask; - else - frac += fracstep; - - while (frac >= heightmask) - frac -= heightmask; - } while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[source[(frac>>FRACBITS) & heightmask]]; - } - else - { - *dest = colormap[source[(frac>>FRACBITS) & heightmask]]; - } - - dest += vid.width; - frac += fracstep; - - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[source[(frac>>FRACBITS) & heightmask]]; - } - else - { - *dest = colormap[source[(frac>>FRACBITS) & heightmask]]; - } - - dest += vid.width; - frac += fracstep; - } - - if (count & 1) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[source[(frac>>FRACBITS) & heightmask]]; - } - else - { - *dest = colormap[source[(frac>>FRACBITS) & heightmask]]; - } - } - } - } -} - -void R_Draw2sMultiPatchColumn_8(drawcolumndata_t* dc) -{ - INT32 count; - 
register UINT8 *dest; - register fixed_t frac; - fixed_t fracstep; - - count = dc->yh - dc->yl; - - if (count < 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - return; -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - count++; - - // Determine scaling, which is the only mapping to be done. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. - { - register const UINT8 *source = dc->source; - register const UINT8 *brightmap = dc->brightmap; - register const lighttable_t *colormap = dc->colormap; - register const lighttable_t *fullbright = dc->fullbright; - register INT32 heightmask = dc->texheight-1; - register UINT8 val; - if (dc->texheight & heightmask) // not a power of 2 -- killough - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0); - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - val = source[frac>>FRACBITS]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - - dest += vid.width; - - // Avoid overflow. 
- if (fracstep > 0x7FFFFFFF - frac) - frac += fracstep - heightmask; - else - frac += fracstep; - - while (frac >= heightmask) - frac -= heightmask; - } while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - - dest += vid.width; - frac += fracstep; - - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - - dest += vid.width; - frac += fracstep; - } - - if (count & 1) - { - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - } - } - } -} - -void R_Draw2sMultiPatchTranslucentColumn_8(drawcolumndata_t* dc) -{ - INT32 count; - register UINT8 *dest; - register fixed_t frac; - fixed_t fracstep; - - count = dc->yh - dc->yl; - - if (count < 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - return; -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - count++; - - // Determine scaling, which is the only mapping to be done. 
- fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. - { - register const UINT8 *source = dc->source; - register const UINT8 *brightmap = dc->brightmap; - register const UINT8 *transmap = dc->transmap; - register const lighttable_t *colormap = dc->colormap; - register const lighttable_t *fullbright = dc->fullbright; - register INT32 heightmask = dc->texheight-1; - register UINT8 val; - if (dc->texheight & heightmask) // not a power of 2 -- killough - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0); - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - val = source[frac>>FRACBITS]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[val]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[val]<<8) + (*dest)); - } - } - - dest += vid.width; - - // Avoid overflow. 
- if (fracstep > 0x7FFFFFFF - frac) - frac += fracstep - heightmask; - else - frac += fracstep; - - while (frac >= heightmask) - frac -= heightmask; - } while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[val]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[val]<<8) + (*dest)); - } - } - - dest += vid.width; - frac += fracstep; - - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[val]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[val]<<8) + (*dest)); - } - } - - dest += vid.width; - frac += fracstep; - } - if (count & 1) - { - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[val]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[val]<<8) + (*dest)); - } - } - } - } - } -} - -/** \brief The R_DrawShadeColumn_8 function - Experiment to make software go faster. Taken from the Boom source -*/ -void R_DrawShadeColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - register fixed_t frac, fracstep; - - // check out coords for src* - if ((dc->yl < 0) || (dc->x >= vid.width)) - return; - - count = dc->yh - dc->yl; - if (count < 0) - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawShadeColumn_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // FIXME. As above. 
- //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - // Looks familiar. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Here we do an additional index re-mapping. - do - { - *dest = colormaps[(dc->source[frac>>FRACBITS] <<8) + (*dest)]; - dest += vid.width; - frac += fracstep; - } while (count--); -} - -/** \brief The R_DrawTranslucentColumn_8 function - I've made an asm routine for the transparency, because it slows down - a lot in 640x480 with big sprites (bfg on all screen, or transparent - walls on fullscreen) -*/ -void R_DrawTranslucentColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - register fixed_t frac, fracstep; - - count = dc->yh - dc->yl + 1; - - if (count <= 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawTranslucentColumn_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // FIXME. As above. - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - // Looks familiar. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. 
- { - register const UINT8 *source = dc->source; - register const UINT8 *brightmap = dc->brightmap; - register const UINT8 *transmap = dc->transmap; - register const lighttable_t *colormap = dc->colormap; - register const lighttable_t *fullbright = dc->fullbright; - register INT32 heightmask = dc->texheight - 1; - if (dc->texheight & heightmask) - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0) - ; - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - if (brightmap != NULL && brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[frac>>FRACBITS]]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[source[frac>>FRACBITS]]<<8) + (*dest)); - } - dest += vid.width; - if ((frac += fracstep) >= heightmask) - frac -= heightmask; - } - while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - dest += vid.width; - frac += fracstep; - - if (brightmap != NULL && brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - dest += vid.width; - frac += fracstep; - } - if (count & 1) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - else - { - *dest = *(transmap + 
(colormap[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - } - } - } -} - -// Hack: A cut-down copy of R_DrawTranslucentColumn_8 that does not read texture -// data since something about calculating the texture reading address for drop shadows is broken. -// dc_texturemid and dc_iscale get wrong values for drop shadows, however those are not strictly -// needed for the current design of the shadows, so this function bypasses the issue -// by not using those variables at all. -void R_DrawDropShadowColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - - count = dc->yh - dc->yl + 1; - - if (count <= 0) // Zero length, column does not exceed a pixel. - return; - - dest = &topleft[dc->yl*vid.width + dc->x]; - - { - register const UINT8 *transmap_offset = dc->transmap + (dc->shadowcolor << 8); - while ((count -= 2) >= 0) - { - *dest = *(transmap_offset + (*dest)); - dest += vid.width; - *dest = *(transmap_offset + (*dest)); - dest += vid.width; - } - if (count & 1) - *dest = *(transmap_offset + (*dest)); - } -} - -/** \brief The R_DrawTranslatedTranslucentColumn_8 function - Spiffy function. Not only does it colormap a sprite, but does translucency as well. - Uber-kudos to Cyan Helkaraxe -*/ -void R_DrawTranslatedTranslucentColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - register fixed_t frac, fracstep; - - count = dc->yh - dc->yl + 1; - - if (count <= 0) // Zero length, column does not exceed a pixel. - return; - - // FIXME. As above. - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - // Looks familiar. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. 
- { - register INT32 heightmask = dc->texheight - 1; - if (dc->texheight & heightmask) - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0) - ; - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - - if (dc->brightmap != NULL && dc->brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = *(dc->transmap + (dc->fullbright[dc->translation[dc->source[frac>>FRACBITS]]]<<8) + (*dest)); - } - else - { - *dest = *(dc->transmap + (dc->colormap[dc->translation[dc->source[frac>>FRACBITS]]]<<8) + (*dest)); - } - - dest += vid.width; - if ((frac += fracstep) >= heightmask) - frac -= heightmask; - } - while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - if (dc->brightmap != NULL && dc->brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(dc->transmap + (dc->fullbright[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - else - { - *dest = *(dc->transmap + (dc->colormap[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - - dest += vid.width; - frac += fracstep; - - if (dc->brightmap != NULL && dc->brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(dc->transmap + (dc->fullbright[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - else - { - *dest = *(dc->transmap + (dc->colormap[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - - dest += vid.width; - frac += fracstep; - } - if (count & 1) - { - if (dc->brightmap != NULL && dc->brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(dc->transmap + (dc->fullbright[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - else - { - *dest = *(dc->transmap + 
(dc->colormap[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - } - } - } -} - -/** \brief The R_DrawTranslatedColumn_8 function - Draw columns up to 128 high but remap the green ramp to other colors - - \warning STILL NOT IN ASM, TO DO.. -*/ -void R_DrawTranslatedColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - register fixed_t frac, fracstep; - - count = dc->yh - dc->yl; - if (count < 0) - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawTranslatedColumn_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // FIXME. As above. - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl*vid.width + dc->x]; - - // Looks familiar. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl-centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Here we do an additional index re-mapping. - do - { - // Translation tables are used - // to map certain colorramps to other ones, - // used with PLAY sprites. - // Thus the "green" ramp of the player 0 sprite - // is mapped to gray, red, black/indigo. - if (dc->brightmap != NULL && dc->brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = dc->fullbright[dc->translation[dc->source[frac>>FRACBITS]]]; - } - else - { - *dest = dc->colormap[dc->translation[dc->source[frac>>FRACBITS]]]; - } - - dest += vid.width; - - frac += fracstep; - } while (count--); -} - -// ========================================================================== -// SPANS -// ========================================================================== - -#define SPANSIZE 16 -#define INVSPAN 0.0625f - -// 4194303 = (2048x2048)-1 (2048x2048 is maximum flat size) -#define MAXFLATBYTES 4194303 - -/** \brief The R_DrawSpan_8 function - Draws the actual span. 
-*/ -void R_DrawSpan_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? - xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - if (dest+8 > deststop) - return; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! 
- - for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = fullbright[source[bit]]; - } - else - { - dest[i] = colormap[source[bit]]; - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - *dest = colormap[source[bit]]; - } - - dest++; - xposition += xstep; - yposition += ystep; - } -} - -// R_CalcTiltedLighting -// Exactly what it says on the tin. I wish I wasn't too lazy to explain things properly. -void R_CalcTiltedLighting(INT32 *lightbuffer, INT32 x1, INT32 x2, fixed_t start, fixed_t end) -{ - // ZDoom uses a different lighting setup to us, and I couldn't figure out how to adapt their version - // of this function. Here's my own. - INT32 left = x1, right = x2; - fixed_t step = (end-start)/(x2 - x1 + 1); - INT32 i; - - // I wanna do some optimizing by checking for out-of-range segments on either side to fill in all at once, - // but I'm too bad at coding to not crash the game trying to do that. I guess this is fast enough for now... - - for (i = left; i <= right; i++) { - lightbuffer[i] = (start += step) >> FRACBITS; - if (lightbuffer[i] < 0) - lightbuffer[i] = 0; - else if (lightbuffer[i] >= MAXLIGHTSCALE) - lightbuffer[i] = MAXLIGHTSCALE-1; - } -} - -/** \brief The R_DrawTiltedSpan_8 function - Draw slopes! Holy sheit! 
-*/ -void R_DrawTiltedSpan_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - INT32 tiltlighting[MAXVIDWIDTH]; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - - source = ds->source; - brightmap = ds->brightmap; - //colormap = ds_colormap; - fullbright = ds->fullbright; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. 
- i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - colormap = planezlight[tiltlighting[ds_x1]] + (ds_colormap - colormaps); - *dest = colormap[source[bit]]; - } - dest++; - ds_x1++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[source[bit]]; - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[source[bit]]; - } - ds->x1++; - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - 
stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[source[bit]]; - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -/** \brief The R_DrawTiltedTranslucentSpan_8 function - Like DrawTiltedSpan, but translucent -*/ -void R_DrawTiltedTranslucentSpan_8(drawspandata_t* ds) -{ - TracyCZone(__zone, true); - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - INT32 tiltlighting[MAXVIDWIDTH]; - - INT32 x1 = ds->x1; - const INT32 nflatxshift = ds->nflatxshift; - const INT32 nflatyshift = ds->nflatyshift; - const INT32 nflatmask = ds->nflatmask; - UINT8 *transmap = ds->transmap; - lighttable_t **planezlight = ds->planezlight; - lighttable_t *ds_colormap = ds->colormap; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. 
It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - - source = ds->source; - brightmap = ds->brightmap; - //colormap = ds_colormap; - fullbright = ds->fullbright; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. - i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds_transmap + (fullbright[source[bit]] << 8) + *dest); - } - else - { - colormap = planezlight[tiltlighting[ds_x1]] + (ds_colormap - colormaps); - *dest = *(ds_transmap + (colormap[source[bit]] << 8) + *dest); - } - dest++; - ds_x1++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - x1 = ds->x1; - - for (i = 0; i < SPANSIZE; i++) - { - bit = (((v + stepv * i) >> nflatyshift) & nflatmask) | ((u + 
stepu * i) >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(transmap + (fullbright[source[bit]] << 8) + dest[i]); - } - else - { - colormap = planezlight[tiltlighting[x1 + i]] + (ds_colormap - colormaps); - dest[i] = *(transmap + (colormap[source[bit]] << 8) + dest[i]); - } - } - ds->x1 += SPANSIZE; - dest += SPANSIZE; - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[bit]] << 8) + *dest); - } - else - { - colormap = planezlight[tiltlighting[ds->x1]] + (ds_colormap - colormaps); - *dest = *(transmap + (colormap[source[bit]] << 8) + *dest); - } - ds->x1++; - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift);; - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[bit]] << 8) + *dest); - } - else - { - colormap = planezlight[tiltlighting[ds->x1]] + (ds_colormap - colormaps); - *dest = *(transmap + (colormap[source[bit]] << 8) + *dest); - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - } - } -#endif - TracyCZoneEnd(__zone); -} - -/** \brief The R_DrawTiltedTranslucentWaterSpan_8 function - Like DrawTiltedTranslucentSpan, but for water -*/ -void R_DrawTiltedTranslucentWaterSpan_8(drawspandata_t* ds) -{ - TracyCZone(__zone, true); - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 
*source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - UINT8 *dsrc; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - INT32 tiltlighting[MAXVIDWIDTH]; - - INT32 x1 = ds->x1; - const INT32 nflatxshift = ds->nflatxshift; - const INT32 nflatyshift = ds->nflatyshift; - const INT32 nflatmask = ds->nflatmask; - UINT8 *transmap = ds->transmap; - lighttable_t **planezlight = ds->planezlight; - lighttable_t *ds_colormap = ds->colormap; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - dsrc = screens[1] + (ds->y+ds->bgofs)*vid.width + ds->x1; - source = ds->source; - brightmap = ds->brightmap; - //colormap = ds_colormap; - fullbright = ds->fullbright; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. 
- i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds_transmap + (fullbright[source[bit]] << 8) + *dsrc); - } - else - { - colormap = planezlight[tiltlighting[ds_x1]] + (ds_colormap - colormaps); - *dest = *(ds_transmap + (colormap[source[bit]] << 8) + *dsrc); - } - dest++; - ds_x1++; - dsrc++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - x1 = ds->x1; - - for (i = 0; i < SPANSIZE; i++) - { - bit = (((v + stepv * i) >> nflatyshift) & nflatmask) | ((u + stepu * i) >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = transmap[(fullbright[source[bit]] << 8) + dsrc[i]]; - } - else - { - colormap = planezlight[tiltlighting[x1 + i]] + (ds_colormap - colormaps); - dest[i] = transmap[(colormap[source[bit]] << 8) + dsrc[i]]; - } - } - ds->x1 += SPANSIZE; - dest += SPANSIZE; - dsrc += SPANSIZE; - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[bit]] << 8) + *dsrc); - } - else - { - colormap = planezlight[tiltlighting[ds->x1]] + (ds_colormap - colormaps); - *dest = *(transmap + (colormap[source[bit]] << 8) + 
*dsrc); - } - ds->x1++; - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[bit]] << 8) + *dsrc); - } - else - { - colormap = planezlight[tiltlighting[ds->x1]] + (ds_colormap - colormaps); - *dest = *(transmap + (colormap[source[bit]] << 8) + *dsrc); - } - dest++; - ds->x1++; - dsrc++; - u += stepu; - v += stepv; - } - } - } -#endif - TracyCZoneEnd(__zone); -} - -void R_DrawTiltedSplat_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - - UINT8 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - INT32 tiltlighting[MAXVIDWIDTH]; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. 
It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - - source = ds->source; - brightmap = ds->brightmap; - //colormap = ds_colormap; - fullbright = ds->fullbright; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. - i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - colormap = planezlight[tiltlighting[ds_x1]] + (ds_colormap - colormaps); - *dest = colormap[val]; - } - } - - dest++; - ds_x1++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - 
if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[val]; - } - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[val]; - } - ds->x1++; - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[val]; - } - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -/** \brief The R_DrawSplat_8 function - Just like R_DrawSpan_8, but skips transparent pixels. 
-*/ -void R_DrawSplat_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? - xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! 
- for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - bit &= MAXFLATBYTES; - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = fullbright[val]; - } - else - { - dest[i] = colormap[val]; - } - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTranslucentSplat_8 function - Just like R_DrawSplat_8, but is translucent! -*/ -void R_DrawTranslucentSplat_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? 
- xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! - for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(ds->transmap + (fullbright[val] << 8) + dest[i]); - } - else - { - dest[i] = *(ds->transmap + (colormap[val] << 8) + dest[i]); - } - - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds->transmap + (fullbright[val] << 8) + *dest); - } - else - { - *dest = *(ds->transmap + (colormap[val] << 8) + *dest); - } - - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawFloorSprite_8 function - Just like R_DrawSplat_8, but for floor sprites. 
-*/ -void R_DrawFloorSprite_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT16 *source; - UINT16 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *translation; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? - xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = (UINT16 *)ds->source; - brightmap = (UINT16 *)ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - translation = ds->translation; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! 
- for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = fullbright[translation[val & 0xFF]]; - } - else - { - dest[i] = colormap[translation[val & 0xFF]]; - } - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[translation[val & 0xFF]]; - } - else - { - *dest = colormap[translation[val & 0xFF]]; - } - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTranslucentFloorSplat_8 function - Just like R_DrawFloorSprite_8, but is translucent! -*/ -void R_DrawTranslucentFloorSprite_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT16 *source; - UINT16 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *translation; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? 
- xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = (UINT16 *)ds->source; - brightmap = (UINT16 *)ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - translation = ds->translation; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! - for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(ds->transmap + (fullbright[translation[val & 0xFF]] << 8) + dest[i]); - } - else - { - dest[i] = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + dest[i]); - } - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds->transmap + (fullbright[translation[val & 0xFF]] << 8) + *dest); - } - else - { - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - } - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTiltedFloorSprite_8 function - Draws a tilted floor sprite. 
-*/ -void R_DrawTiltedFloorSprite_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT16 *source; - UINT16 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *translation; - UINT8 *dest; - UINT16 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = (UINT16 *)ds->source; - brightmap = (UINT16 *)ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - translation = ds->translation; - - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[translation[val & 0xFF]]; - } - else - { - *dest = colormap[translation[val & 0xFF]]; - } - } - dest++; - - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != 
NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[translation[val & 0xFF]]; - } - else - { - *dest = colormap[translation[val & 0xFF]]; - } - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[translation[val & 0xFF]]; - } - else - { - *dest = colormap[translation[val & 0xFF]]; - } - } - dest++; - - u += stepu; - v += stepv; - } - } - } -} - -/** \brief The R_DrawTiltedTranslucentFloorSprite_8 function - Draws a tilted, translucent, floor sprite. -*/ -void R_DrawTiltedTranslucentFloorSprite_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT16 *source; - UINT16 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *translation; - UINT8 *dest; - UINT16 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - - const INT32 nflatxshift = ds->nflatxshift; - const INT32 nflatyshift = ds->nflatyshift; - const INT32 nflatmask = ds->nflatmask; - UINT8 *transmap = ds->transmap; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = (UINT16 *)ds->source; - brightmap = (UINT16 *)ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - translation 
= ds->translation; - - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = 0; i < SPANSIZE; i++) - { - bit = (((v + stepv * i) >> nflatyshift) & nflatmask) | ((u + stepu * i) >> nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(transmap + (fullbright[translation[val & 0xFF]] << 8) + dest[i]); - } - else - { - dest[i] = *(transmap + (colormap[translation[val & 0xFF]] << 8) + dest[i]); - } - } - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[translation[val & 0xFF]] << 8) + *dest); - } - else - { - *dest = *(transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - } - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[translation[val & 0xFF]] << 8) + *dest); 
- } - else - { - *dest = *(transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - } - } - dest++; - - u += stepu; - v += stepv; - } - } - } -} - -/** \brief The R_DrawTranslucentSpan_8 function - Draws the actual span with translucency. -*/ -void R_DrawTranslucentSpan_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? - xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! 
- for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(ds->transmap + (fullbright[source[bit]] << 8) + dest[i]); - } - else - { - dest[i] = *(ds->transmap + (colormap[source[bit]] << 8) + dest[i]); - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds->transmap + (fullbright[source[bit]] << 8) + *dest); - } - else - { - *dest = *(ds->transmap + (colormap[source[bit]] << 8) + *dest); - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -void R_DrawTranslucentWaterSpan_8(drawspandata_t* ds) -{ - UINT32 xposition; - UINT32 yposition; - UINT32 xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - UINT8 *dsrc; - - size_t count; - size_t i; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? 
- xposition = ds->xfrac << ds->nflatshiftup; yposition = (ds->yfrac + ds->waterofs) << ds->nflatshiftup; - xstep = ds->xstep << ds->nflatshiftup; ystep = ds->ystep << ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - dsrc = screens[1] + (ds->y+ds->bgofs)*vid.width + ds->x1; - count = ds->x2 - ds->x1 + 1; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! - for (i = 0; i < 8; i++) - { - bit = ((yposition >> ds->nflatyshift) & ds->nflatmask) | (xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = fullbright[*(ds->transmap + (source[bit] << 8) + dsrc[i])]; - } - else - { - dest[i] = colormap[*(ds->transmap + (source[bit] << 8) + dsrc[i])]; - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - dsrc += 8; - count -= 8; - } - while (count--) - { - bit = ((yposition >> ds->nflatyshift) & ds->nflatmask) | (xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[*(ds->transmap + (source[bit] << 8) + *dsrc)]; - } - else - { - *dest = colormap[*(ds->transmap + (source[bit] << 8) + *dsrc)]; - } - dest++; - dsrc++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawFogSpan_8 function - Draws the actual span with fogging. 
-*/ -void R_DrawFogSpan_8(drawspandata_t* ds) -{ - UINT8 *colormap; - UINT8 *dest; - - size_t count; - - colormap = ds->colormap; - //dest = ylookup[ds_y] + columnofs[ds_x1]; - dest = &topleft[ds->y *vid.width + ds->x1]; - - count = ds->x2 - ds->x1 + 1; - - while (count >= 4) - { - dest[0] = colormap[dest[0]]; - dest[1] = colormap[dest[1]]; - dest[2] = colormap[dest[2]]; - dest[3] = colormap[dest[3]]; - - dest += 4; - count -= 4; - } - - while (count--) - { - *dest = colormap[*dest]; - dest++; - } -} - -/** \brief The R_DrawFogColumn_8 function - Fog wall. -*/ -void R_DrawFogColumn_8(drawcolumndata_t* dc) -{ - INT32 count; - UINT8 *dest; - - count = dc->yh - dc->yl; - - // Zero length, column does not exceed a pixel. - if (count < 0) - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawFogColumn_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl*vid.width + dc->x]; - - // Determine scaling, which is the only mapping to be done. - do - { - // Simple. Apply the colormap to what's already on the screen. - *dest = dc->colormap[*dest]; - dest += vid.width; - } while (count--); -} - -/** \brief The R_DrawShadeColumn_8 function - This is for 3D floors that cast shadows on walls. - - This function just cuts the column up into sections and calls R_DrawColumn_8 -*/ -void R_DrawColumnShadowed_8(drawcolumndata_t* dc) -{ - INT32 count, realyh, i, height, bheight = 0, solid = 0; - - realyh = dc->yh; - - count = dc->yh - dc->yl; - - // Zero length, column does not exceed a pixel. 
- if (count < 0) - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawColumnShadowed_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // This runs through the lightlist from top to bottom and cuts up the column accordingly. - for (i = 0; i < dc->numlights; i++) - { - // If the height of the light is above the column, get the colormap - // anyway because the lighting of the top should be affected. - solid = dc->lightlist[i].flags & FOF_CUTSOLIDS; - - height = dc->lightlist[i].height >> LIGHTSCALESHIFT; - if (solid) - { - bheight = dc->lightlist[i].botheight >> LIGHTSCALESHIFT; - if (bheight < height) - { - // confounded slopes sometimes allow partial invertedness, - // even including cases where the top and bottom heights - // should actually be the same! - // swap the height values as a workaround for this quirk - INT32 temp = height; - height = bheight; - bheight = temp; - } - } - if (height <= dc->yl) - { - dc->colormap = dc->lightlist[i].rcolormap; - dc->fullbright = colormaps; - if (encoremap) - { - dc->colormap += COLORMAP_REMAPOFFSET; - dc->fullbright += COLORMAP_REMAPOFFSET; - } - if (solid && dc->yl < bheight) - dc->yl = bheight; - continue; - } - // Found a break in the column! 
- dc->yh = height; - - if (dc->yh > realyh) - dc->yh = realyh; - (colfuncs[BASEDRAWFUNC])(dc); // R_DrawColumn_8 for the appropriate architecture - if (solid) - dc->yl = bheight; - else - dc->yl = dc->yh + 1; - - dc->colormap = dc->lightlist[i].rcolormap; - dc->fullbright = colormaps; - if (encoremap) - { - dc->colormap += COLORMAP_REMAPOFFSET; - dc->fullbright += COLORMAP_REMAPOFFSET; - } - } - dc->yh = realyh; - if (dc->yl <= realyh) - (colfuncs[BASEDRAWFUNC])(dc); // R_DrawWallColumn_8 for the appropriate architecture -} diff --git a/src/r_draw8_flat.c b/src/r_draw8_flat.c deleted file mode 100644 index f6669b069..000000000 --- a/src/r_draw8_flat.c +++ /dev/null @@ -1,80 +0,0 @@ -// SONIC ROBO BLAST 2 -//----------------------------------------------------------------------------- -// Copyright (C) 1998-2000 by DooM Legacy Team. -// Copyright (C) 1999-2020 by Sonic Team Junior. -// Copyright (C) 2023 by Kart Krew. -// -// This program is free software distributed under the -// terms of the GNU General Public License, version 2. -// See the 'LICENSE' file for more details. -//----------------------------------------------------------------------------- -/// \file r_draw8_flat.c -/// \brief 8bpp span/column drawer functions for debugging (draws in flat colors only) -/// \note no includes because this is included as part of r_draw.c - -void R_DrawColumn_Flat_8 (drawcolumndata_t* dc) -{ - INT32 count; - UINT8 color = dc->lightmap[dc->r8_flatcolor]; - register UINT8 *dest; - - count = dc->yh - dc->yl; - - if (count < 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - return; -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? 
- - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl*vid.width + dc->x]; - - count++; - - do - { - *dest = color; - dest += vid.width; - } while (--count); -} - -void R_DrawSpan_Flat_8 (drawspandata_t* ds) -{ - UINT8 *dest = ylookup[ds->y] + columnofs[ds->x1]; - - memset(dest, ds->colormap[ds->r8_flatcolor], (ds->x2 - ds->x1) + 1); -} - -void R_DrawTiltedSpan_Flat_8 (drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - INT32 tiltlighting[MAXVIDWIDTH]; - - UINT8 *dest = ylookup[ds->y]; - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - while (ds->x1 <= ds->x2) - { - dest[ds->x1] = ds->planezlight[tiltlighting[ds->x1]][ds->r8_flatcolor]; - ds->x1++; - } -} diff --git a/src/r_draw8_npo2.c b/src/r_draw8_npo2.c deleted file mode 100644 index 07adefdfe..000000000 --- a/src/r_draw8_npo2.c +++ /dev/null @@ -1,1618 +0,0 @@ -// SONIC ROBO BLAST 2 -//----------------------------------------------------------------------------- -// Copyright (C) 1998-2000 by DooM Legacy Team. -// Copyright (C) 1999-2020 by Sonic Team Junior. -// -// This program is free software distributed under the -// terms of the GNU General Public License, version 2. -// See the 'LICENSE' file for more details. 
-//----------------------------------------------------------------------------- -/// \file r_draw8_npo2.c -/// \brief 8bpp span drawer functions (for non-powers-of-two flat dimensions) -/// \note no includes because this is included as part of r_draw.c - -// ========================================================================== -// SPANS -// ========================================================================== - -#define SPANSIZE 16 -#define INVSPAN 0.0625f - -/** \brief The R_DrawSpan_NPO2_8 function - Draws the actual span. -*/ -void R_DrawSpan_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - if (dest+8 > deststop) - return; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - - *dest++ = colormap[source[((y * ds->flatwidth) + x)]]; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTiltedSpan_NPO2_8 function - Draw slopes! Holy sheit! -*/ -void R_DrawTiltedSpan_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - INT32 tiltlighting[MAXVIDWIDTH]; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = ds->source; - //colormap = ds_colormap; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. 
- // Use it only to see how things are supposed to look. - i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - colormap = planezlight[tiltlighting[ds_x1++]] + (ds_colormap - colormaps); - - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds_flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds_flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds_flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds_flatheight; - - *dest = colormap[source[((y * ds_flatwidth) + x)]]; - } - dest++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = colormap[source[((y * ds->flatwidth) + x)]]; - } - dest++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = colormap[source[((y * ds->flatwidth) + x)]]; - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = colormap[source[((y * ds->flatwidth) + x)]]; - } - dest++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -/** \brief The R_DrawTiltedTranslucentSpan_NPO2_8 function - Like DrawTiltedSpan_NPO2, but translucent -*/ -void R_DrawTiltedTranslucentSpan_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - INT32 tiltlighting[MAXVIDWIDTH]; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = ds->source; - //colormap = ds_colormap; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. 
- i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - colormap = planezlight[tiltlighting[ds_x1++]] + (ds_colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds_flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds_flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds_flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds_flatheight; - - *dest = *(ds_transmap + (colormap[source[((y * ds_flatwidth) + x)]] << 8) + *dest); - } - dest++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dest); - } - dest++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dest); - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dest); - } - dest++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -void R_DrawTiltedSplat_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - - UINT8 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - INT32 tiltlighting[MAXVIDWIDTH]; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = ds->source; - //colormap = ds_colormap; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. 
- i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - colormap = planezlight[tiltlighting[ds_x1++]] + (ds_colormap - colormaps); - - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds_flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds_flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds_flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds_flatheight; - - val = source[((y * ds_flatwidth) + x)]; - } - - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - - dest++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - } - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - dest++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - } - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - } - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - dest++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -/** \brief The R_DrawSplat_NPO2_8 function - Just like R_DrawSpan_NPO2_8, but skips transparent pixels. -*/ -void R_DrawSplat_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = source[((y * ds->flatwidth) + x)]; - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTranslucentSplat_NPO2_8 function - Just like R_DrawSplat_NPO2_8, but is translucent! -*/ -void R_DrawTranslucentSplat_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = source[((y * ds->flatwidth) + x)]; - if (val != TRANSPARENTPIXEL) - *dest = *(ds->transmap + (colormap[val] << 8) + *dest); - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawFloorSprite_NPO2_8 function - Just like R_DrawSplat_NPO2_8, but for floor sprites. -*/ -void R_DrawFloorSprite_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT16 *source; - UINT8 *translation; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = (UINT16 *)ds->source; - colormap = ds->colormap; - translation = ds->translation; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. 
- // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. - if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = colormap[translation[val & 0xFF]]; - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTranslucentFloorSprite_NPO2_8 function - Just like R_DrawFloorSprite_NPO2_8, but is translucent! -*/ -void R_DrawTranslucentFloorSprite_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT16 *source; - UINT8 *translation; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = (UINT16 *)ds->source; - colormap = ds->colormap; - translation = ds->translation; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. 
- if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. - if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTiltedFloorSprite_NPO2_8 function - Draws a tilted floor sprite. 
-*/ -void R_DrawTiltedFloorSprite_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT16 *source; - UINT8 *colormap; - UINT8 *translation; - UINT8 *dest; - UINT16 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = (UINT16 *)ds->source; - colormap = ds->colormap; - translation = ds->translation; - - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - // Lactozilla: Non-powers-of-two - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = colormap[translation[val & 0xFF]]; - dest++; - - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = colormap[translation[val & 0xFF]]; - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - // Lactozilla: Non-powers-of-two - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = colormap[translation[val & 0xFF]]; - dest++; - - u += stepu; - v += stepv; - } - } - } -} - -/** \brief The R_DrawTiltedTranslucentFloorSprite_NPO2_8 function - Draws a tilted, translucent, floor sprite. -*/ -void R_DrawTiltedTranslucentFloorSprite_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT16 *source; - UINT8 *colormap; - UINT8 *translation; - UINT8 *dest; - UINT16 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = (UINT16 *)ds->source; - colormap = ds->colormap; - translation = ds->translation; - - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) 
- { - // Lactozilla: Non-powers-of-two - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - dest++; - - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - // Lactozilla: Non-powers-of-two - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - dest++; - - u += stepu; - v += stepv; - } - } - } -} - -/** \brief The R_DrawTranslucentSpan_NPO2_8 function - Draws the actual span with translucency. -*/ -void R_DrawTranslucentSpan_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = ((y * ds->flatwidth) + x); - *dest = *(ds->transmap + (colormap[source[val]] << 8) + *dest); - dest++; - xposition += xstep; - yposition += ystep; - } -} - -void R_DrawTranslucentWaterSpan_NPO2_8(drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - UINT8 *dsrc; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - - xposition = ds->xfrac; yposition = (ds->yfrac + ds->waterofs); - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - dsrc = screens[1] + (ds->y+ds->bgofs)*vid.width + ds->x1; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - *dest++ = colormap[*(ds->transmap + (source[((y * ds->flatwidth) + x)] << 8) + *dsrc++)]; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTiltedTranslucentWaterSpan_NPO2_8 function - Like DrawTiltedTranslucentSpan_NPO2, but for water -*/ -void R_DrawTiltedTranslucentWaterSpan_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - UINT8 *dsrc; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - INT32 tiltlighting[MAXVIDWIDTH]; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. 
It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - dsrc = screens[1] + (ds->y+ds->bgofs)*vid.width + ds->x1; - source = ds->source; - //colormap = ds->colormap; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. - i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - colormap = planezlight[tiltlighting[ds_x1++]] + (ds_colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds_flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds_flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds_flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds_flatheight; - - *dest = *(ds_transmap + (colormap[source[((y * ds_flatwidth) + x)]] << 8) + *dsrc++); - } - dest++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dsrc++); - } - dest++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dsrc++); - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dsrc++); - } - dest++; - u += stepu; - v += stepv; - } - } - } -#endif -} diff --git a/src/r_draw_column.cpp b/src/r_draw_column.cpp new file mode 100644 index 000000000..93f6b8a8d --- /dev/null +++ b/src/r_draw_column.cpp @@ -0,0 +1,413 @@ +// SONIC ROBO BLAST 2 +//----------------------------------------------------------------------------- +// Copyright (C) 1998-2000 by DooM Legacy Team. +// Copyright (C) 1999-2021 by Sonic Team Junior. +// +// This program is free software distributed under the +// terms of the GNU General Public License, version 2. +// See the 'LICENSE' file for more details. +//----------------------------------------------------------------------------- +/// \file r_draw_column.cpp +/// \brief column drawer functions +/// \note no includes because this is included as part of r_draw.cpp + +// ========================================================================== +// COLUMNS +// ========================================================================== + +// A column is a vertical slice/span of a wall texture that uses +// a has a constant z depth from top to bottom. 
+// + +enum DrawColumnType +{ + DC_BASIC = 0x0000, + DC_COLORMAP = 0x0001, + DC_TRANSMAP = 0x0002, + DC_BRIGHTMAP = 0x0004, + DC_HOLES = 0x0008, + DC_LIGHTLIST = 0x0010, +}; + +template +static constexpr UINT8 R_GetColumnTranslated(drawcolumndata_t* dc, UINT8 col) +{ + if constexpr (Type & DrawColumnType::DC_COLORMAP) + { + return dc->translation[col]; + } + else + { + return col; + } +} + +template +static constexpr UINT8 R_GetColumnBrightmapped(drawcolumndata_t* dc, UINT32 bit, UINT8 col) +{ + col = R_GetColumnTranslated(dc, col); + + if constexpr (Type & DrawColumnType::DC_BRIGHTMAP) + { + if (dc->brightmap[bit] == BRIGHTPIXEL) + { + return dc->fullbright[col]; + } + } + + return dc->colormap[col]; +} + +template +static constexpr UINT8 R_GetColumnTranslucent(drawcolumndata_t* dc, UINT8 *dest, UINT32 bit, UINT8 col) +{ + col = R_GetColumnBrightmapped(dc, bit, col); + + if constexpr (Type & DrawColumnType::DC_TRANSMAP) + { + return *(dc->transmap + (col << 8) + (*dest)); + } + else + { + return col; + } +} + +template +static constexpr UINT8 R_DrawColumnPixel(drawcolumndata_t* dc, UINT8 *dest, UINT32 bit) +{ + UINT8 col = dc->source[bit]; + + if constexpr (Type & DrawColumnType::DC_HOLES) + { + if (col == TRANSPARENTPIXEL) + { + return *dest; + } + } + + return R_GetColumnTranslucent(dc, dest, bit, col); +} + +/** \brief The R_DrawColumn function + Experiment to make software go faster. Taken from the Boom source +*/ +template +static void R_DrawColumnTemplate(drawcolumndata_t *dc) +{ + INT32 count; + UINT8 *dest; + + count = dc->yh - dc->yl; + + if (count < 0) // Zero length, column does not exceed a pixel. 
+ { + return; + } + +#ifdef RANGECHECK + if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) + { + return; + } +#endif + + if constexpr (Type & DrawColumnType::DC_LIGHTLIST) + { + constexpr DrawColumnType NewType = static_cast(Type & ~DC_LIGHTLIST); + INT32 i, realyh, height, bheight = 0, solid = 0; + drawcolumndata_t dc_copy = *dc; + + realyh = dc_copy.yh; + + // This runs through the lightlist from top to bottom and cuts up the column accordingly. + for (i = 0; i < dc->numlights; i++) + { + // If the height of the light is above the column, get the colormap + // anyway because the lighting of the top should be affected. + solid = dc->lightlist[i].flags & FOF_CUTSOLIDS; + height = dc->lightlist[i].height >> LIGHTSCALESHIFT; + + if (solid) + { + bheight = dc->lightlist[i].botheight >> LIGHTSCALESHIFT; + + if (bheight < height) + { + // confounded slopes sometimes allow partial invertedness, + // even including cases where the top and bottom heights + // should actually be the same! + // swap the height values as a workaround for this quirk + INT32 temp = height; + height = bheight; + bheight = temp; + } + } + + if (height <= dc_copy.yl) + { + dc_copy.colormap = dc->lightlist[i].rcolormap; + dc_copy.fullbright = colormaps; + + if (encoremap) + { + dc_copy.colormap += COLORMAP_REMAPOFFSET; + dc_copy.fullbright += COLORMAP_REMAPOFFSET; + } + + if (solid && dc_copy.yl < bheight) + { + dc_copy.yl = bheight; + } + + continue; + } + + // Found a break in the column! 
+ dc_copy.yh = height; + + if (dc_copy.yh > realyh) + { + dc_copy.yh = realyh; + } + + R_DrawColumnTemplate(&dc_copy); + if (solid) + { + dc_copy.yl = bheight; + } + else + { + dc_copy.yl = dc_copy.yh + 1; + } + + dc_copy.colormap = dc_copy.lightlist[i].rcolormap; + dc_copy.fullbright = colormaps; + if (encoremap) + { + dc_copy.colormap += COLORMAP_REMAPOFFSET; + dc_copy.fullbright += COLORMAP_REMAPOFFSET; + } + } + + dc_copy.yh = realyh; + + if (dc_copy.yl <= realyh) + { + R_DrawColumnTemplate(&dc_copy); + } + } + else + { + fixed_t fracstep; + fixed_t frac; + INT32 heightmask; + + // Framebuffer destination address. + // Use ylookup LUT to avoid multiply with ScreenWidth. + // Use columnofs LUT for subwindows? + + //dest = ylookup[dc_yl] + columnofs[dc_x]; + dest = &topleft[dc->yl * vid.width + dc->x]; + + count++; + + // Determine scaling, which is the only mapping to be done. + fracstep = dc->iscale; + //frac = dc_texturemid + (dc_yl - centery)*fracstep; + frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep)) * (!dc->hires); + + // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. + // This is as fast as it gets. + heightmask = dc->texheight-1; + + if (dc->texheight & heightmask) // not a power of 2 -- killough + { + heightmask++; + heightmask <<= FRACBITS; + + if (frac < 0) + { + while ((frac += heightmask) < 0) + { + ; + } + } + else + { + while (frac >= heightmask) + { + frac -= heightmask; + } + } + + do + { + // Re-map color indices from wall texture column + // using a lighting/special effects LUT. + // heightmask is the Tutti-Frutti fix + *dest = R_DrawColumnPixel(dc, dest, frac >> FRACBITS); + + dest += vid.width; + + // Avoid overflow. 
+ if (fracstep > 0x7FFFFFFF - frac) + { + frac += fracstep - heightmask; + } + else + { + frac += fracstep; + } + + while (frac >= heightmask) + { + frac -= heightmask; + } + } + while (--count); + } + else + { + while ((count -= 2) >= 0) // texture height is a power of 2 + { + *dest = R_DrawColumnPixel(dc, dest, (frac>>FRACBITS) & heightmask); + + dest += vid.width; + frac += fracstep; + + *dest = R_DrawColumnPixel(dc, dest, (frac>>FRACBITS) & heightmask); + + dest += vid.width; + frac += fracstep; + } + + if (count & 1) + { + *dest = R_DrawColumnPixel(dc, dest, (frac>>FRACBITS) & heightmask); + } + } + } +} + +#define DEFINE_COLUMN_FUNC(name, flags) \ + void name(drawcolumndata_t *dc) \ + { \ + ZoneScoped; \ + constexpr DrawColumnType opt = static_cast(flags); \ + R_DrawColumnTemplate(dc); \ + } + +#define DEFINE_COLUMN_COMBO(name, flags) \ + DEFINE_COLUMN_FUNC(name, flags) \ + DEFINE_COLUMN_FUNC(name ## _Brightmap, flags|DC_BRIGHTMAP) + +DEFINE_COLUMN_COMBO(R_DrawColumn, DC_BASIC) +DEFINE_COLUMN_COMBO(R_DrawTranslucentColumn, DC_TRANSMAP) +DEFINE_COLUMN_COMBO(R_DrawTranslatedColumn, DC_COLORMAP) +DEFINE_COLUMN_COMBO(R_DrawColumnShadowed, DC_LIGHTLIST) +DEFINE_COLUMN_COMBO(R_DrawTranslatedTranslucentColumn, DC_COLORMAP|DC_TRANSMAP) +DEFINE_COLUMN_COMBO(R_Draw2sMultiPatchColumn, DC_HOLES) +DEFINE_COLUMN_COMBO(R_Draw2sMultiPatchTranslucentColumn, DC_HOLES|DC_TRANSMAP) + +void R_DrawFogColumn(drawcolumndata_t *dc) +{ + ZoneScoped; + + INT32 count; + UINT8 *dest; + + count = dc->yh - dc->yl; + + // Zero length, column does not exceed a pixel. + if (count < 0) + return; + +#ifdef RANGECHECK + if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) + return; +#endif + + // Framebuffer destination address. + // Use ylookup LUT to avoid multiply with ScreenWidth. + // Use columnofs LUT for subwindows? 
+ //dest = ylookup[dc_yl] + columnofs[dc_x]; + dest = &topleft[dc->yl*vid.width + dc->x]; + + // Determine scaling, which is the only mapping to be done. + do + { + // Simple. Apply the colormap to what's already on the screen. + *dest = dc->colormap[*dest]; + dest += vid.width; + } + while (count--); +} + +void R_DrawDropShadowColumn(drawcolumndata_t *dc) +{ + ZoneScoped; + + // Hack: A cut-down copy of R_DrawTranslucentColumn_8 that does not read texture + // data since something about calculating the texture reading address for drop shadows is broken. + // dc_texturemid and dc_iscale get wrong values for drop shadows, however those are not strictly + // needed for the current design of the shadows, so this function bypasses the issue + // by not using those variables at all. + + INT32 count; + UINT8 *dest; + + count = dc->yh - dc->yl + 1; + + if (count <= 0) // Zero length, column does not exceed a pixel. + return; + + dest = &topleft[dc->yl*vid.width + dc->x]; + + const UINT8 *transmap_offset = dc->transmap + (dc->shadowcolor << 8); + while ((count -= 2) >= 0) + { + *dest = *(transmap_offset + (*dest)); + dest += vid.width; + *dest = *(transmap_offset + (*dest)); + dest += vid.width; + } + + if (count & 1) + *dest = *(transmap_offset + (*dest)); +} + +void R_DrawColumn_Flat(drawcolumndata_t *dc) +{ + ZoneScoped; + + INT32 count; + UINT8 color = dc->lightmap[dc->r8_flatcolor]; + UINT8 *dest; + + count = dc->yh - dc->yl; + + if (count < 0) // Zero length, column does not exceed a pixel. + return; + +#ifdef RANGECHECK + if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) + return; +#endif + + // Framebuffer destination address. + // Use ylookup LUT to avoid multiply with ScreenWidth. + // Use columnofs LUT for subwindows? 
+ + //dest = ylookup[dc_yl] + columnofs[dc_x]; + dest = &topleft[dc->yl*vid.width + dc->x]; + + count++; + + do + { + *dest = color; + dest += vid.width; + } + while (--count); +} diff --git a/src/r_draw_span.cpp b/src/r_draw_span.cpp new file mode 100644 index 000000000..bd7d2cc41 --- /dev/null +++ b/src/r_draw_span.cpp @@ -0,0 +1,866 @@ +// SONIC ROBO BLAST 2 +//----------------------------------------------------------------------------- +// Copyright (C) 1998-2000 by DooM Legacy Team. +// Copyright (C) 1999-2021 by Sonic Team Junior. +// +// This program is free software distributed under the +// terms of the GNU General Public License, version 2. +// See the 'LICENSE' file for more details. +//----------------------------------------------------------------------------- +/// \file r_draw_span.cpp +/// \brief span drawer functions +/// \note no includes because this is included as part of r_draw.cpp + +using namespace libdivide; + +// ========================================================================== +// SPANS +// ========================================================================== + +#define SPANSIZE 16 +#define INVSPAN 0.0625f + +// 4194303 = (2048x2048)-1 (2048x2048 is maximum flat size) +#define MAXFLATBYTES 4194303 + +#define PLANELIGHTFLOAT (BASEVIDWIDTH * BASEVIDWIDTH / vid.width / ds->zeroheight / 21.0f * FIXED_TO_FLOAT(fovtan[viewssnum])) + +enum DrawSpanType +{ + DS_BASIC = 0x0000, + DS_COLORMAP = 0x0001, + DS_TRANSMAP = 0x0002, + DS_BRIGHTMAP = 0x0004, + DS_HOLES = 0x0008, + DS_RIPPLE = 0x0010, + DS_SPRITE = 0x0020, +}; + +template +static constexpr UINT8 R_GetSpanTranslated(drawspandata_t* ds, UINT8 col) +{ + if constexpr (Type & DrawSpanType::DS_COLORMAP) + { + return ds->translation[col]; + } + else + { + return col; + } +} + +template +static constexpr UINT8 R_GetSpanBrightmapped(drawspandata_t* ds, UINT8 *colormap, UINT32 bit, UINT8 col) +{ + col = R_GetSpanTranslated(ds, col); + + if constexpr (Type & DrawSpanType::DS_BRIGHTMAP) + 
{ + UINT8 brightCol = 31; + + if constexpr (Type & DrawSpanType::DS_SPRITE) + { + UINT16 *spriteSource = reinterpret_cast(ds->brightmap); + UINT16 spriteCol = spriteSource[bit]; + + if (spriteCol & 0xFF00) + { + brightCol = (spriteCol & 0xFF); + } + } + else + { + brightCol = ds->brightmap[bit]; + } + + if (brightCol == BRIGHTPIXEL) + { + return ds->fullbright[col]; + } + } + + return colormap[col]; +} + +template +static constexpr UINT8 R_GetSpanTranslucent(drawspandata_t* ds, UINT8 *dsrc, UINT8 *colormap, UINT32 bit, UINT8 col) +{ + col = R_GetSpanBrightmapped(ds, colormap, bit, col); + + if constexpr (Type & DrawSpanType::DS_TRANSMAP) + { + return *(ds->transmap + (col << 8) + (*dsrc)); + } + else + { + return col; + } +} + +template +static constexpr UINT8 R_DrawSpanPixel(drawspandata_t* ds, UINT8 *dsrc, UINT8 *colormap, UINT32 bit) +{ + UINT8 col = 0; + + if constexpr (Type & DrawSpanType::DS_SPRITE) + { + UINT16 *spriteSource = reinterpret_cast(ds->source); + UINT16 spriteCol = spriteSource[bit]; + + if (spriteCol & 0xFF00) + { + col = (spriteCol & 0xFF); + } + else + { + return *dsrc; + } + } + else + { + col = ds->source[bit]; + } + + if constexpr (Type & DrawSpanType::DS_HOLES) + { + if (col == TRANSPARENTPIXEL) + { + return *dsrc; + } + } + + return R_GetSpanTranslucent(ds, dsrc, colormap, bit, col); +} + +/** \brief The R_DrawSpan_8 function + Draws the actual span. +*/ +template +static void R_DrawSpanTemplate(drawspandata_t* ds) +{ + fixed_t xposition; + fixed_t yposition; + fixed_t xstep, ystep; + UINT32 bit; + + UINT8 *dest; + UINT8 *dsrc; + + const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; + + size_t count = (ds->x2 - ds->x1 + 1); + size_t i; + + xposition = ds->xfrac; yposition = ds->yfrac; + xstep = ds->xstep; ystep = ds->ystep; + + if constexpr (Type & DS_RIPPLE) + { + yposition += ds->waterofs; + } + + // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest + // can be used for the fraction part. 
This allows calculation of the memory address in the + // texture with two shifts, an OR and one AND. (see below) + // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one + // bit per power of two (obviously) + // Ok, because I was able to eliminate the variable spot below, this function is now FASTER + // than the original span renderer. Whodathunkit? + xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; + xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; + + dest = ylookup[ds->y] + columnofs[ds->x1]; + if constexpr (Type & DS_RIPPLE) + { + dsrc = screens[1] + (ds->y + ds->bgofs) * vid.width + ds->x1; + } + else + { + dsrc = dest; + } + + if (dest+8 > deststop) + { + return; + } + + while (count >= 8) + { + // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't + // have the uber complicated math to calculate it now, so that was a memory write we didn't + // need! + + for (i = 0; i < 8; i++) + { + bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); + + dest[i] = R_DrawSpanPixel(ds, dsrc, ds->colormap, bit); + + xposition += xstep; + yposition += ystep; + } + + dest += 8; + dsrc += 8; + + count -= 8; + } + + while (count-- && dest <= deststop) + { + bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); + + *dest = R_DrawSpanPixel(ds, dsrc, ds->colormap, bit); + + dest++; + dsrc++; + + xposition += xstep; + yposition += ystep; + } +} + +// R_CalcTiltedLighting +// Exactly what it says on the tin. I wish I wasn't too lazy to explain things properly. +static void R_CalcTiltedLighting(INT32 *lightbuffer, INT32 x1, INT32 x2, fixed_t start, fixed_t end) +{ + // ZDoom uses a different lighting setup to us, and I couldn't figure out how to adapt their version + // of this function. Here's my own. 
+ INT32 left = x1, right = x2; + fixed_t step = (end-start)/(x2 - x1 + 1); + INT32 i; + + // I wanna do some optimizing by checking for out-of-range segments on either side to fill in all at once, + // but I'm too bad at coding to not crash the game trying to do that. I guess this is fast enough for now... + + for (i = left; i <= right; i++) + { + lightbuffer[i] = (start += step) >> FRACBITS; + + if (lightbuffer[i] < 0) + { + lightbuffer[i] = 0; + } + else if (lightbuffer[i] >= MAXLIGHTSCALE) + { + lightbuffer[i] = MAXLIGHTSCALE-1; + } + } +} + +template +static void R_DrawTiltedSpanTemplate(drawspandata_t* ds) +{ + // x1, x2 = ds_x1, ds_x2 + int width = ds->x2 - ds->x1; + double iz, uz, vz; + UINT32 u, v; + int i; + + UINT8 *colormap; + UINT8 *dest; + UINT8 *dsrc; + + double startz, startu, startv; + double izstep, uzstep, vzstep; + double endz, endu, endv; + UINT32 stepu, stepv; + UINT32 bit; + INT32 tiltlighting[MAXVIDWIDTH]; + + INT32 x1 = ds->x1; + const INT32 nflatxshift = ds->nflatxshift; + const INT32 nflatyshift = ds->nflatyshift; + const INT32 nflatmask = ds->nflatmask; + + iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); + + // Lighting is simple. 
It's just linear interpolation from start to end + if constexpr (!(Type & DS_SPRITE)) + { + float planelightfloat = PLANELIGHTFLOAT; + float lightstart, lightend; + + lightend = (iz + ds->szp.x*width) * planelightfloat; + lightstart = iz * planelightfloat; + + R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); + //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); + } + + uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); + vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); + + colormap = ds->colormap; + + dest = ylookup[ds->y] + columnofs[ds->x1]; + if constexpr (Type & DS_RIPPLE) + { + dsrc = screens[1] + (ds->y + ds->bgofs) * vid.width + ds->x1; + } + else + { + dsrc = dest; + } + +#if 0 // The "perfect" reference version of this routine. Pretty slow. + // Use it only to see how things are supposed to look. + i = 0; + do + { + double z = 1.f/iz; + u = (INT64)(uz*z); + v = (INT64)(vz*z); + + bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); + if constexpr (!(Type & DS_SPRITE)) + { + colormap = planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); + } + *dest = R_DrawSpanPixel(ds, dsrc, colormap, bit); + dest++; + ds->x1++; + dsrc++; + iz += ds_szp->x; + uz += ds_sup->x; + vz += ds_svp->x; + } while (--width >= 0); +#else + startz = 1.f/iz; + startu = uz*startz; + startv = vz*startz; + + izstep = ds->szp.x * SPANSIZE; + uzstep = ds->sup.x * SPANSIZE; + vzstep = ds->svp.x * SPANSIZE; + //x1 = 0; + width++; + + while (width >= SPANSIZE) + { + iz += izstep; + uz += uzstep; + vz += vzstep; + + endz = 1.f/iz; + endu = uz*endz; + endv = vz*endz; + stepu = (INT64)((endu - startu) * INVSPAN); + stepv = (INT64)((endv - startv) * INVSPAN); + u = (INT64)(startu); + v = (INT64)(startv); + + x1 = ds->x1; + + for (i = 0; i < SPANSIZE; i++) + { + bit = (((v + stepv * i) >> nflatyshift) & nflatmask) | ((u + stepu * i) >> 
nflatxshift); + + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[x1 + i]] + (ds->colormap - colormaps); + } + + dest[i] = R_DrawSpanPixel(ds, &dsrc[i], colormap, bit); + } + + ds->x1 += SPANSIZE; + dest += SPANSIZE; + dsrc += SPANSIZE; + startu = endu; + startv = endv; + width -= SPANSIZE; + } + + if (width > 0) + { + if (width == 1) + { + u = (INT64)(startu); + v = (INT64)(startv); + bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); + } + *dest = R_DrawSpanPixel(ds, dsrc, colormap, bit); + ds->x1++; + } + else + { + double left = width; + iz += ds->szp.x * left; + uz += ds->sup.x * left; + vz += ds->svp.x * left; + + endz = 1.f/iz; + endu = uz*endz; + endv = vz*endz; + left = 1.f/left; + stepu = (INT64)((endu - startu) * left); + stepv = (INT64)((endv - startv) * left); + u = (INT64)(startu); + v = (INT64)(startv); + + for (; width != 0; width--) + { + bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); + } + *dest = R_DrawSpanPixel(ds, dsrc, colormap, bit); + dest++; + ds->x1++; + dsrc++; + u += stepu; + v += stepv; + } + } + } +#endif +} + +/** \brief The R_DrawSpan_NPO2 function + Draws the actual span. 
+*/ +template +static void R_DrawNPO2SpanTemplate(drawspandata_t* ds) +{ + fixed_t xposition; + fixed_t yposition; + fixed_t xstep, ystep; + fixed_t x, y; + fixed_t fixedwidth, fixedheight; + + UINT8 *dest; + UINT8 *dsrc; + const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; + + size_t count = (ds->x2 - ds->x1 + 1); + + xposition = ds->xfrac; yposition = ds->yfrac; + xstep = ds->xstep; ystep = ds->ystep; + + if constexpr (Type & DS_RIPPLE) + { + yposition += ds->waterofs; + } + + dest = ylookup[ds->y] + columnofs[ds->x1]; + + if constexpr (Type & DS_RIPPLE) + { + dsrc = screens[1] + (ds->y + ds->bgofs) * vid.width + ds->x1; + } + else + { + dsrc = dest; + } + + if (dest+8 > deststop) + return; + + fixedwidth = ds->flatwidth << FRACBITS; + fixedheight = ds->flatheight << FRACBITS; + + // Fix xposition and yposition if they are out of bounds. + if (xposition < 0) + xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); + else if (xposition >= fixedwidth) + xposition %= fixedwidth; + if (yposition < 0) + yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); + else if (yposition >= fixedheight) + yposition %= fixedheight; + + while (count-- && dest <= deststop) + { + // The loops here keep the texture coordinates within the texture. + // They will rarely iterate multiple times, and are cheaper than a modulo operation, + // even if using libdivide. 
+ if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop + while (xposition < 0) + xposition += fixedwidth; + else + while (xposition >= fixedwidth) + xposition -= fixedwidth; + if (ystep < 0) + while (yposition < 0) + yposition += fixedheight; + else + while (yposition >= fixedheight) + yposition -= fixedheight; + + x = (xposition >> FRACBITS); + y = (yposition >> FRACBITS); + + *dest = R_DrawSpanPixel(ds, dsrc, ds->colormap, ((y * ds->flatwidth) + x)); + dest++; + dsrc++; + + xposition += xstep; + yposition += ystep; + } +} + +/** \brief The R_DrawTiltedSpan_NPO2_8 function + Draw slopes! Holy sheit! +*/ +template +static void R_DrawTiltedNPO2SpanTemplate(drawspandata_t* ds) +{ + // x1, x2 = ds_x1, ds_x2 + int width = ds->x2 - ds->x1; + double iz, uz, vz; + UINT32 u, v; + int i; + + UINT8 *colormap; + UINT8 *dest; + UINT8 *dsrc; + + double startz, startu, startv; + double izstep, uzstep, vzstep; + double endz, endu, endv; + UINT32 stepu, stepv; + INT32 tiltlighting[MAXVIDWIDTH]; + + struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); + struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); + + iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); + + // Lighting is simple. 
It's just linear interpolation from start to end + if constexpr (!(Type & DS_SPRITE)) + { + float planelightfloat = PLANELIGHTFLOAT; + float lightstart, lightend; + + lightend = (iz + ds->szp.x*width) * planelightfloat; + lightstart = iz * planelightfloat; + + R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); + //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); + } + + uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); + vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); + + colormap = ds->colormap; + + dest = ylookup[ds->y] + columnofs[ds->x1]; + + if constexpr (Type & DS_RIPPLE) + { + dsrc = screens[1] + (ds->y + ds->bgofs) * vid.width + ds->x1; + } + else + { + dsrc = dest; + } + +#if 0 // The "perfect" reference version of this routine. Pretty slow. + // Use it only to see how things are supposed to look. + i = 0; + do + { + double z = 1.f/iz; + u = (INT64)(uz*z); + v = (INT64)(vz*z); + + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); + } + + // Lactozilla: Non-powers-of-two + { + fixed_t x = (((fixed_t)u) >> FRACBITS); + fixed_t y = (((fixed_t)v) >> FRACBITS); + + // Carefully align all of my Friends. 
+ if (x < 0)
+ x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth;
+ else
+ x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth;
+ if (y < 0)
+ y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight;
+ else
+ y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight;
+
+ *dest = R_DrawSpanPixel(ds, dsrc, colormap, ((y * ds->flatwidth) + x));
+ }
+ dest++;
+ dsrc++;
+ iz += ds->szp.x;
+ uz += ds->sup.x;
+ vz += ds->svp.x;
+ } while (--width >= 0);
+#else
+ startz = 1.f/iz;
+ startu = uz*startz;
+ startv = vz*startz;
+
+ izstep = ds->szp.x * SPANSIZE;
+ uzstep = ds->sup.x * SPANSIZE;
+ vzstep = ds->svp.x * SPANSIZE;
+ //x1 = 0;
+ width++;
+
+ while (width >= SPANSIZE)
+ {
+ iz += izstep;
+ uz += uzstep;
+ vz += vzstep;
+
+ endz = 1.f/iz;
+ endu = uz*endz;
+ endv = vz*endz;
+ stepu = (INT64)((endu - startu) * INVSPAN);
+ stepv = (INT64)((endv - startv) * INVSPAN);
+ u = (INT64)(startu);
+ v = (INT64)(startv);
+
+ for (i = SPANSIZE-1; i >= 0; i--)
+ {
+ if constexpr (!(Type & DS_SPRITE))
+ {
+ colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps);
+ }
+
+ // Lactozilla: Non-powers-of-two
+ {
+ fixed_t x = (((fixed_t)u) >> FRACBITS);
+ fixed_t y = (((fixed_t)v) >> FRACBITS);
+
+ // Carefully align all of my Friends. 
+ if (x < 0) + x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; + else + x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; + if (y < 0) + y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; + else + y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; + + *dest = R_DrawSpanPixel(ds, dsrc, colormap, ((y * ds->flatwidth) + x)); + } + dest++; + dsrc++; + u += stepu; + v += stepv; + } + startu = endu; + startv = endv; + width -= SPANSIZE; + } + if (width > 0) + { + if (width == 1) + { + u = (INT64)(startu); + v = (INT64)(startv); + + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); + } + + // Lactozilla: Non-powers-of-two + { + fixed_t x = (((fixed_t)u) >> FRACBITS); + fixed_t y = (((fixed_t)v) >> FRACBITS); + + // Carefully align all of my Friends. + if (x < 0) + x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; + else + x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; + if (y < 0) + y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; + else + y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; + + *dest = R_DrawSpanPixel(ds, dsrc, colormap, ((y * ds->flatwidth) + x)); + } + } + else + { + double left = width; + iz += ds->szp.x * left; + uz += ds->sup.x * left; + vz += ds->svp.x * left; + + endz = 1.f/iz; + endu = uz*endz; + endv = vz*endz; + left = 1.f/left; + stepu = (INT64)((endu - startu) * left); + stepv = (INT64)((endv - startv) * left); + u = (INT64)(startu); + v = (INT64)(startv); + + for (; width != 0; width--) + { + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); + } + + // Lactozilla: Non-powers-of-two + { + fixed_t x = (((fixed_t)u) >> FRACBITS); + fixed_t y = (((fixed_t)v) >> FRACBITS); + + // Carefully align all of my Friends. 
+ if (x < 0) + x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; + else + x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; + if (y < 0) + y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; + else + y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; + + *dest = R_DrawSpanPixel(ds, dsrc, colormap, ((y * ds->flatwidth) + x)); + } + dest++; + dsrc++; + u += stepu; + v += stepv; + } + } + } +#endif +} + +#define DEFINE_SPAN_FUNC(name, flags, template) \ + void name(drawspandata_t* ds) \ + { \ + ZoneScoped; \ + constexpr DrawSpanType opt = static_cast(flags); \ + template(ds); \ + } + +#define DEFINE_SPAN_COMBO(name, flags) \ + DEFINE_SPAN_FUNC(name, flags, R_DrawSpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Tilted, flags, R_DrawTiltedSpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _NPO2, flags, R_DrawNPO2SpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Tilted_NPO2, flags, R_DrawTiltedNPO2SpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Brightmap, flags|DS_BRIGHTMAP, R_DrawSpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Tilted_Brightmap, flags|DS_BRIGHTMAP, R_DrawTiltedSpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Brightmap_NPO2, flags|DS_BRIGHTMAP, R_DrawNPO2SpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Tilted_Brightmap_NPO2, flags|DS_BRIGHTMAP, R_DrawTiltedNPO2SpanTemplate) + +DEFINE_SPAN_COMBO(R_DrawSpan, DS_BASIC) +DEFINE_SPAN_COMBO(R_DrawTranslucentSpan, DS_TRANSMAP) +DEFINE_SPAN_COMBO(R_DrawSplat, DS_HOLES) +DEFINE_SPAN_COMBO(R_DrawTranslucentSplat, DS_TRANSMAP|DS_HOLES) +DEFINE_SPAN_COMBO(R_DrawFloorSprite, DS_COLORMAP|DS_SPRITE) +DEFINE_SPAN_COMBO(R_DrawTranslucentFloorSprite, DS_COLORMAP|DS_TRANSMAP|DS_SPRITE) +DEFINE_SPAN_COMBO(R_DrawTranslucentWaterSpan, DS_TRANSMAP|DS_RIPPLE) + +void R_DrawFogSpan(drawspandata_t* ds) +{ + ZoneScoped; + + UINT8 *colormap; + UINT8 *dest; + + size_t count; + + colormap = ds->colormap; + + //dest = ylookup[ds_y] + columnofs[ds_x1]; + dest = &topleft[ds->y *vid.width + ds->x1]; + + 
count = ds->x2 - ds->x1 + 1; + + while (count >= 4) + { + dest[0] = colormap[dest[0]]; + dest[1] = colormap[dest[1]]; + dest[2] = colormap[dest[2]]; + dest[3] = colormap[dest[3]]; + + dest += 4; + count -= 4; + } + + while (count--) + { + *dest = colormap[*dest]; + dest++; + } +} + +void R_DrawFogSpan_Tilted(drawspandata_t* ds) +{ + ZoneScoped; + + // x1, x2 = ds_x1, ds_x2 + int width = ds->x2 - ds->x1; + double iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); + INT32 tiltlighting[MAXVIDWIDTH]; + + UINT8 *dest = ylookup[ds->y] + columnofs[ds->x1]; + + // Lighting is simple. It's just linear interpolation from start to end + { + float planelightfloat = PLANELIGHTFLOAT; + float lightstart, lightend; + + lightend = (iz + ds->szp.x*width) * planelightfloat; + lightstart = iz * planelightfloat; + + R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); + //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); + } + + do + { + UINT8 *colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); + *dest = colormap[*dest]; + dest++; + } + while (--width >= 0); +} + +void R_DrawSpan_Flat(drawspandata_t* ds) +{ + ZoneScoped; + + UINT8 *dest = ylookup[ds->y] + columnofs[ds->x1]; + memset(dest, ds->colormap[ds->r8_flatcolor], (ds->x2 - ds->x1) + 1); +} + +void R_DrawTiltedSpan_Flat(drawspandata_t* ds) +{ + ZoneScoped; + + // x1, x2 = ds_x1, ds_x2 + int width = ds->x2 - ds->x1; + double iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); + INT32 tiltlighting[MAXVIDWIDTH]; + + UINT8 *dest = ylookup[ds->y]; + + // Lighting is simple. 
It's just linear interpolation from start to end + { + float planelightfloat = PLANELIGHTFLOAT; + float lightstart, lightend; + + lightend = (iz + ds->szp.x*width) * planelightfloat; + lightstart = iz * planelightfloat; + + R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); + //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); + } + + while (ds->x1 <= ds->x2) + { + dest[ds->x1] = ds->planezlight[tiltlighting[ds->x1]][ds->r8_flatcolor]; + ds->x1++; + } +} diff --git a/src/r_plane.cpp b/src/r_plane.cpp index 36ba9d1b5..b85cafbf9 100644 --- a/src/r_plane.cpp +++ b/src/r_plane.cpp @@ -962,7 +962,7 @@ void R_DrawSinglePlane(drawspandata_t *ds, visplane_t *pl, boolean allow_paralle { dc.yl = pl->top[dc.x]; dc.yh = pl->bottom[dc.x]; - R_DrawColumn_Flat_8(&dc); + R_DrawColumn_Flat(&dc); } } else @@ -1202,6 +1202,9 @@ void R_DrawSinglePlane(drawspandata_t *ds, visplane_t *pl, boolean allow_paralle case SPANDRAWFUNC_SPLAT: spanfunctype = SPANDRAWFUNC_TILTEDSPLAT; break; + case SPANDRAWFUNC_FOG: + spanfunctype = SPANDRAWFUNC_TILTEDFOG; + break; default: spanfunctype = SPANDRAWFUNC_TILTED; break; @@ -1240,77 +1243,6 @@ void R_DrawSinglePlane(drawspandata_t *ds, visplane_t *pl, boolean allow_paralle for (x = pl->minx; x <= stop; x++) R_MakeSpans(mapfunc, spanfunc, ds, x, pl->top[x-1], pl->bottom[x-1], pl->top[x], pl->bottom[x], allow_parallel); - -/* -QUINCUNX anti-aliasing technique (sort of) - -Normally, Quincunx antialiasing staggers pixels -in a 5-die pattern like so: - -o o - o -o o - -To simulate this, we offset the plane by -FRACUNIT/4 in each direction, and draw -at 50% translucency. The result is -a 'smoothing' of the texture while -using the palette colors. 
-*/ -#ifdef QUINCUNX - if (spanfunc == spanfuncs[BASEDRAWFUNC]) - { - INT32 i; - ds_transmap = R_GetTranslucencyTable(tr_trans50); - spanfunc = spanfuncs[SPANDRAWFUNC_TRANS]; - for (i=0; i<4; i++) - { - xoffs = pl->xoffs; - yoffs = pl->yoffs; - - switch(i) - { - case 0: - xoffs -= FRACUNIT/4; - yoffs -= FRACUNIT/4; - break; - case 1: - xoffs -= FRACUNIT/4; - yoffs += FRACUNIT/4; - break; - case 2: - xoffs += FRACUNIT/4; - yoffs -= FRACUNIT/4; - break; - case 3: - xoffs += FRACUNIT/4; - yoffs += FRACUNIT/4; - break; - } - ds->planeheight = abs(pl->height - pl->viewz); - - if (light >= LIGHTLEVELS) - light = LIGHTLEVELS-1; - - if (light < 0) - light = 0; - - planezlight = zlight[light]; - - // set the maximum value for unsigned - pl->top[pl->maxx+1] = 0xffff; - pl->top[pl->minx-1] = 0xffff; - pl->bottom[pl->maxx+1] = 0x0000; - pl->bottom[pl->minx-1] = 0x0000; - - stop = pl->maxx + 1; - - for (x = pl->minx; x <= stop; x++) - R_MakeSpans(mapfunc, x, pl->top[x-1], pl->bottom[x-1], - pl->top[x], pl->bottom[x]); - } - } -#endif } void R_PlaneBounds(visplane_t *plane) diff --git a/src/r_segs.cpp b/src/r_segs.cpp index c349ca712..b376605de 100644 --- a/src/r_segs.cpp +++ b/src/r_segs.cpp @@ -688,7 +688,7 @@ void R_RenderMaskedSegRange(drawseg_t *drawseg, INT32 x1, INT32 x2) if (debug) { - colfunc = R_DrawColumn_Flat_8; + colfunc = R_DrawColumn_Flat; dc->r8_flatcolor = R_DebugLineColor(ldef); R_RenderMaskedSegLoopDebug(dc, drawseg, x1, x2, colfunc_2s); } diff --git a/src/r_splats.c b/src/r_splats.c index 0607b4832..0c2511b6e 100644 --- a/src/r_splats.c +++ b/src/r_splats.c @@ -30,20 +30,8 @@ static void prepare_rastertab(void); static void R_RasterizeFloorSplat(floorsplat_t *pSplat, vector2_t *verts, vissprite_t *vis); -#ifdef USEASM -void ASMCALL rasterize_segment_tex_asm(INT32 x1, INT32 y1, INT32 x2, INT32 y2, INT32 tv1, INT32 tv2, INT32 tc, INT32 dir); -#endif - static void rasterize_segment_tex(INT32 x1, INT32 y1, INT32 x2, INT32 y2, INT32 tv1, INT32 tv2, INT32 tc, INT32 
dir) { -#ifdef USEASM - if (R_ASM) - { - rasterize_segment_tex_asm(x1, y1, x2, y2, tv1, tv2, tc, dir); - return; - } - else -#endif { fixed_t xs, xe, count; fixed_t dx0, dx1; diff --git a/src/screen.c b/src/screen.c index 889af63cb..5d223a0fb 100644 --- a/src/screen.c +++ b/src/screen.c @@ -72,126 +72,116 @@ UINT8 *scr_borderpatch; // flat used to fill the reduced view borders set at ST_ // ========================================================================= -// Short and Tall sky drawer, for the current color mode -void (*walldrawerfunc)(void); - -boolean R_ASM = true; -boolean R_486 = false; -boolean R_586 = false; -boolean R_MMX = false; -boolean R_SSE = false; -boolean R_3DNow = false; -boolean R_MMXExt = false; -boolean R_SSE2 = false; - void SCR_SetDrawFuncs(void) { // - // setup the right draw routines for either 8bpp or 16bpp + // setup the right draw routines // - if (true)//vid.bpp == 1) //Always run in 8bpp. todo: remove all 16bpp code? - { - colfuncs[BASEDRAWFUNC] = R_DrawColumn_8; - spanfuncs[BASEDRAWFUNC] = R_DrawSpan_8; - colfuncs[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn_8; - colfuncs[COLDRAWFUNC_TRANS] = R_DrawTranslatedColumn_8; - colfuncs[COLDRAWFUNC_SHADE] = R_DrawShadeColumn_8; - colfuncs[COLDRAWFUNC_SHADOWED] = R_DrawColumnShadowed_8; - colfuncs[COLDRAWFUNC_TRANSTRANS] = R_DrawTranslatedTranslucentColumn_8; - colfuncs[COLDRAWFUNC_TWOSMULTIPATCH] = R_Draw2sMultiPatchColumn_8; - colfuncs[COLDRAWFUNC_TWOSMULTIPATCHTRANS] = R_Draw2sMultiPatchTranslucentColumn_8; - colfuncs[COLDRAWFUNC_FOG] = R_DrawFogColumn_8; - colfuncs[COLDRAWFUNC_DROPSHADOW] = R_DrawDropShadowColumn_8; + colfuncs[BASEDRAWFUNC] = R_DrawColumn; + colfuncs[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn; + colfuncs[COLDRAWFUNC_TRANS] = R_DrawTranslatedColumn; + colfuncs[COLDRAWFUNC_SHADOWED] = R_DrawColumnShadowed; + colfuncs[COLDRAWFUNC_TRANSTRANS] = R_DrawTranslatedTranslucentColumn; + colfuncs[COLDRAWFUNC_TWOSMULTIPATCH] = R_Draw2sMultiPatchColumn; + 
colfuncs[COLDRAWFUNC_TWOSMULTIPATCHTRANS] = R_Draw2sMultiPatchTranslucentColumn; + colfuncs[COLDRAWFUNC_FOG] = R_DrawFogColumn; + colfuncs[COLDRAWFUNC_DROPSHADOW] = R_DrawDropShadowColumn; - spanfuncs[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_8; - spanfuncs[SPANDRAWFUNC_TILTED] = R_DrawTiltedSpan_8; - spanfuncs[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTiltedTranslucentSpan_8; - spanfuncs[SPANDRAWFUNC_SPLAT] = R_DrawSplat_8; - spanfuncs[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_8; - spanfuncs[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawTiltedSplat_8; - spanfuncs[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_8; - spanfuncs[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_8; - spanfuncs[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawTiltedFloorSprite_8; - spanfuncs[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTiltedTranslucentFloorSprite_8; - spanfuncs[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_8; - spanfuncs[SPANDRAWFUNC_TILTEDWATER] = R_DrawTiltedTranslucentWaterSpan_8; - spanfuncs[SPANDRAWFUNC_FOG] = R_DrawFogSpan_8; + colfuncs_bm[BASEDRAWFUNC] = R_DrawColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_TRANS] = R_DrawTranslatedColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_SHADOWED] = R_DrawColumnShadowed_Brightmap; + colfuncs_bm[COLDRAWFUNC_TRANSTRANS] = R_DrawTranslatedTranslucentColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_TWOSMULTIPATCH] = R_Draw2sMultiPatchColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_TWOSMULTIPATCHTRANS] = R_Draw2sMultiPatchTranslucentColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_FOG] = NULL; // Not needed + colfuncs_bm[COLDRAWFUNC_DROPSHADOW] = NULL; // Not needed - // Lactozilla: Non-powers-of-two - spanfuncs_npo2[BASEDRAWFUNC] = R_DrawSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTED] = R_DrawTiltedSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTiltedTranslucentSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_SPLAT] = 
R_DrawSplat_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawTiltedSplat_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawTiltedFloorSprite_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTiltedTranslucentFloorSprite_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDWATER] = R_DrawTiltedTranslucentWaterSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_FOG] = NULL; // Not needed + spanfuncs[BASEDRAWFUNC] = R_DrawSpan; + spanfuncs[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan; + spanfuncs[SPANDRAWFUNC_TILTED] = R_DrawSpan_Tilted; + spanfuncs[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTranslucentSpan_Tilted; + spanfuncs[SPANDRAWFUNC_SPLAT] = R_DrawSplat; + spanfuncs[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat; + spanfuncs[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawSplat_Tilted; + spanfuncs[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite; + spanfuncs[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite; + spanfuncs[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawFloorSprite_Tilted; + spanfuncs[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTranslucentFloorSprite_Tilted; + spanfuncs[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan; + spanfuncs[SPANDRAWFUNC_TILTEDWATER] = R_DrawTranslucentWaterSpan_Tilted; + spanfuncs[SPANDRAWFUNC_FOG] = R_DrawFogSpan; + spanfuncs[SPANDRAWFUNC_TILTEDFOG] = R_DrawFogSpan_Tilted; - // Debugging - highlight surfaces in flat colors - spanfuncs_flat[BASEDRAWFUNC] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TRANS] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTED] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_SPLAT] = R_DrawSpan_Flat_8; - 
spanfuncs_flat[SPANDRAWFUNC_TRANSSPLAT] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_SPRITE] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TRANSSPRITE] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_WATER] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDWATER] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_FOG] = R_DrawSpan_Flat_8; // Not needed + spanfuncs_bm[BASEDRAWFUNC] = R_DrawSpan_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTED] = R_DrawSpan_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTranslucentSpan_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_SPLAT] = R_DrawSplat_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawSplat_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawFloorSprite_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTranslucentFloorSprite_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDWATER] = R_DrawTranslucentWaterSpan_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_FOG] = NULL; // Not needed + spanfuncs_bm[SPANDRAWFUNC_TILTEDFOG] = NULL; // Not needed -#if (defined(RUSEASM) && defined(USE_COL_SPAN_ASM)) - if (R_ASM) - { - if (R_MMX) - { - colfuncs_asm[BASEDRAWFUNC] = R_DrawColumn_8_MMX; - //colfuncs_asm[COLDRAWFUNC_SHADE] = R_DrawShadeColumn_8_ASM; - //colfuncs_asm[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn_8_ASM; - colfuncs_asm[COLDRAWFUNC_TWOSMULTIPATCH] 
= R_Draw2sMultiPatchColumn_8_MMX; - spanfuncs_asm[BASEDRAWFUNC] = R_DrawSpan_8_MMX; - } - else - { - colfuncs_asm[BASEDRAWFUNC] = R_DrawColumn_8_ASM; - //colfuncs_asm[COLDRAWFUNC_SHADE] = R_DrawShadeColumn_8_ASM; - //colfuncs_asm[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn_8_ASM; - colfuncs_asm[COLDRAWFUNC_TWOSMULTIPATCH] = R_Draw2sMultiPatchColumn_8_ASM; - } - } -#endif + // Lactozilla: Non-powers-of-two + spanfuncs_npo2[BASEDRAWFUNC] = R_DrawSpan_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTED] = R_DrawSpan_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTranslucentSpan_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_SPLAT] = R_DrawSplat_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawSplat_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawFloorSprite_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTranslucentFloorSprite_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDWATER] = R_DrawTranslucentWaterSpan_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_FOG] = NULL; // Not needed + spanfuncs_npo2[SPANDRAWFUNC_TILTEDFOG] = NULL; // Not needed - R_SetColumnFunc(BASEDRAWFUNC, false); - R_SetSpanFunc(BASEDRAWFUNC, false, false); - } -/* else if (vid.bpp > 1) - { - I_OutputMsg("using highcolor mode\n"); - spanfunc = basespanfunc = R_DrawSpan_16; - transcolfunc = R_DrawTranslatedColumn_16; - transtransfunc = R_DrawTranslucentColumn_16; // No 16bit operation for this function + spanfuncs_bm_npo2[BASEDRAWFUNC] = R_DrawSpan_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTED] = 
R_DrawSpan_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTranslucentSpan_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_SPLAT] = R_DrawSplat_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawSplat_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawFloorSprite_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTranslucentFloorSprite_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDWATER] = R_DrawTranslucentWaterSpan_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_FOG] = NULL; // Not needed + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDFOG] = NULL; // Not needed - colfunc = basecolfunc = R_DrawColumn_16; - shadecolfunc = NULL; // detect error if used somewhere.. 
- fuzzcolfunc = R_DrawTranslucentColumn_16; - walldrawerfunc = R_DrawWallColumn_16; - }*/ - else - I_Error("unknown bytes per pixel mode %d\n", vid.bpp); -/* - if (SCR_IsAspectCorrect(vid.width, vid.height)) - CONS_Alert(CONS_WARNING, M_GetText("Resolution is not aspect-correct!\nUse a multiple of %dx%d\n"), BASEVIDWIDTH, BASEVIDHEIGHT); -*/ + // Debugging - highlight surfaces in flat colors + spanfuncs_flat[BASEDRAWFUNC] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TRANS] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTED] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_SPLAT] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TRANSSPLAT] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_SPRITE] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TRANSSPRITE] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_WATER] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDWATER] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_FOG] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDFOG] = R_DrawTiltedSpan_Flat; + + R_SetColumnFunc(BASEDRAWFUNC, false); + R_SetSpanFunc(BASEDRAWFUNC, false, false); } void R_SetColumnFunc(size_t id, boolean brightmapped) @@ -202,14 +192,12 @@ void R_SetColumnFunc(size_t id, boolean brightmapped) if (debugrender_highlight != 0) { - colfunc = R_DrawColumn_Flat_8; + colfunc = R_DrawColumn_Flat; } -#ifdef USE_COL_SPAN_ASM - else if (colfuncs_asm[id] != NULL && brightmapped == false) + else if (brightmapped == true && colfuncs_bm[id] != NULL) { - colfunc = colfuncs_asm[id]; + colfunc = colfuncs_bm[id]; } -#endif else { colfunc = colfuncs[id]; @@ -225,19 +213,27 @@ void R_SetSpanFunc(size_t id, boolean npo2, boolean brightmapped) return; } - if 
(spanfuncs_npo2[id] != NULL && npo2 == true) + if (brightmapped == true && spanfuncs_bm[id] != NULL) { - spanfunc = spanfuncs_npo2[id]; + if (npo2 == true && spanfuncs_bm_npo2[id] != NULL) + { + spanfunc = spanfuncs_bm_npo2[id]; + } + else + { + spanfunc = spanfuncs_bm[id]; + } } -#ifdef USE_COL_SPAN_ASM - else if (spanfuncs_asm[id] != NULL && brightmapped == false) - { - spanfunc = spanfuncs_asm[id]; - } -#endif else { - spanfunc = spanfuncs[id]; + if (npo2 == true && spanfuncs_npo2[id] != NULL) + { + spanfunc = spanfuncs_npo2[id]; + } + else + { + spanfunc = spanfuncs[id]; + } } } @@ -267,7 +263,7 @@ boolean R_CheckColumnFunc(size_t id) for (i = 0; i < COLDRAWFUNC_MAX; i++) { - if (colfunc == colfuncs[id] || colfunc == colfuncs_asm[id]) + if (colfunc == colfuncs[id] || colfunc == colfuncs_bm[id]) { return true; }