From 24132a9dcda2fa1c37ae735f53a8cf30a2369601 Mon Sep 17 00:00:00 2001 From: Sally Coolatta Date: Tue, 26 Dec 2023 03:00:05 -0500 Subject: [PATCH] Use C++ templates for DrawColumn/Span Two reasons: - Makes it more straight-forward to add brightmaps to the non-power-of-two rendering functions. - Made it easier to split off brightmap rendering. Hopefully improves performance, but I haven't thoroughly tested this. --- src/CMakeLists.txt | 2 +- src/libdivide.h | 2484 +++++++++++++++++++++++--------- src/{r_draw.c => r_draw.cpp} | 52 +- src/r_draw.h | 175 +-- src/r_draw16.c | 214 --- src/r_draw8.c | 2564 ---------------------------------- src/r_draw8_flat.c | 80 -- src/r_draw8_npo2.c | 1618 --------------------- src/r_draw_column.cpp | 413 ++++++ src/r_draw_span.cpp | 866 ++++++++++++ src/r_plane.cpp | 76 +- src/r_segs.cpp | 2 +- src/r_splats.c | 12 - src/screen.c | 244 ++-- 14 files changed, 3336 insertions(+), 5466 deletions(-) rename src/{r_draw.c => r_draw.cpp} (92%) delete mode 100644 src/r_draw16.c delete mode 100644 src/r_draw8.c delete mode 100644 src/r_draw8_flat.c delete mode 100644 src/r_draw8_npo2.c create mode 100644 src/r_draw_column.cpp create mode 100644 src/r_draw_span.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index af7b84605..045452975 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -72,7 +72,7 @@ add_executable(SRB2SDL2 MACOSX_BUNDLE WIN32 r_data.c r_debug.cpp r_debug_parser.cpp - r_draw.c + r_draw.cpp r_fps.c r_main.cpp r_plane.cpp diff --git a/src/libdivide.h b/src/libdivide.h index 1a589c7e5..96dd27211 100644 --- a/src/libdivide.h +++ b/src/libdivide.h @@ -1,124 +1,112 @@ // libdivide.h - Optimized integer division // https://libdivide.com // -// Copyright (C) 2010 - 2019 ridiculous_fish, -// Copyright (C) 2016 - 2019 Kim Walisch, +// Copyright (C) 2010 - 2022 ridiculous_fish, +// Copyright (C) 2016 - 2022 Kim Walisch, // // libdivide is dual-licensed under the Boost or zlib licenses. 
// You may use libdivide under the terms of either of these. -// See LICENSE.txt in the libdivide source code repository for more details. - - -// NOTICE: This is an altered source version of libdivide. -// Libdivide is used here under the terms of the zlib license. -// Here is the zlib license text from https://github.com/ridiculousfish/libdivide/blob/master/LICENSE.txt -/* - zlib License - ------------ - - Copyright (C) 2010 - 2019 ridiculous_fish, - Copyright (C) 2016 - 2019 Kim Walisch, - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. - - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; you must not - claim that you wrote the original software. If you use this software - in a product, an acknowledgment in the product documentation would be - appreciated but is not required. - 2. Altered source versions must be plainly marked as such, and must not be - misrepresented as being the original software. - 3. This notice may not be removed or altered from any source distribution. -*/ - - -// This version of libdivide has been modified for use with SRB2. -// Changes made include: -// - unused parts commented out (to avoid the need to fix C90 compilation issues with them) -// - C90 compilation issues fixed with used parts -// - use I_Error for errors +// See LICENSE.txt for more details. 
#ifndef LIBDIVIDE_H #define LIBDIVIDE_H -#define LIBDIVIDE_VERSION "3.0" -#define LIBDIVIDE_VERSION_MAJOR 3 +#define LIBDIVIDE_VERSION "5.0" +#define LIBDIVIDE_VERSION_MAJOR 5 #define LIBDIVIDE_VERSION_MINOR 0 #include - -#if defined(__cplusplus) - #include - #include - #include -#else - #include - #include +#if !defined(__AVR__) +#include +#include #endif -#if defined(LIBDIVIDE_AVX512) - #include -#elif defined(LIBDIVIDE_AVX2) - #include -#elif defined(LIBDIVIDE_SSE2) - #include +#if defined(LIBDIVIDE_SSE2) +#include +#endif +#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512) +#include +#endif +#if defined(LIBDIVIDE_NEON) +#include #endif #if defined(_MSC_VER) - #include - // disable warning C4146: unary minus operator applied - // to unsigned type, result still unsigned - #pragma warning(disable: 4146) - #define LIBDIVIDE_VC +#include +#pragma warning(push) +// disable warning C4146: unary minus operator applied +// to unsigned type, result still unsigned +#pragma warning(disable : 4146) +// disable warning C4204: nonstandard extension used : non-constant aggregate +// initializer +// +// It's valid C99 +#pragma warning(disable : 4204) +#define LIBDIVIDE_VC #endif #if !defined(__has_builtin) - #define __has_builtin(x) 0 +#define __has_builtin(x) 0 #endif #if defined(__SIZEOF_INT128__) - #define HAS_INT128_T - // clang-cl on Windows does not yet support 128-bit division - #if !(defined(__clang__) && defined(LIBDIVIDE_VC)) - #define HAS_INT128_DIV - #endif +#define HAS_INT128_T +// clang-cl on Windows does not yet support 128-bit division +#if !(defined(__clang__) && defined(LIBDIVIDE_VC)) +#define HAS_INT128_DIV +#endif #endif #if defined(__x86_64__) || defined(_M_X64) - #define LIBDIVIDE_X86_64 +#define LIBDIVIDE_X86_64 #endif #if defined(__i386__) - #define LIBDIVIDE_i386 +#define LIBDIVIDE_i386 #endif #if defined(__GNUC__) || defined(__clang__) - #define LIBDIVIDE_GCC_STYLE_ASM +#define LIBDIVIDE_GCC_STYLE_ASM #endif #if defined(__cplusplus) || 
defined(LIBDIVIDE_VC) - #define LIBDIVIDE_FUNCTION __FUNCTION__ +#define LIBDIVIDE_FUNCTION __FUNCTION__ #else - #define LIBDIVIDE_FUNCTION __func__ +#define LIBDIVIDE_FUNCTION __func__ #endif -#define LIBDIVIDE_ERROR(msg) \ - I_Error("libdivide.h:%d: %s(): Error: %s\n", \ - __LINE__, LIBDIVIDE_FUNCTION, msg); +// Set up forced inlining if possible. +// We need both the attribute and keyword to avoid "might not be inlineable" warnings. +#ifdef __has_attribute +#if __has_attribute(always_inline) +#define LIBDIVIDE_INLINE __attribute__((always_inline)) inline +#endif +#endif +#ifndef LIBDIVIDE_INLINE +#define LIBDIVIDE_INLINE inline +#endif -#if defined(LIBDIVIDE_ASSERTIONS_ON) - #define LIBDIVIDE_ASSERT(x) \ - if (!(x)) { \ - I_Error("libdivide.h:%d: %s(): Assertion failed: %s\n", \ - __LINE__, LIBDIVIDE_FUNCTION, #x); \ - } +#if defined(__AVR__) +#define LIBDIVIDE_ERROR(msg) #else - #define LIBDIVIDE_ASSERT(x) +#define LIBDIVIDE_ERROR(msg) \ + do { \ + fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \ + abort(); \ + } while (0) +#endif + +#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) +#define LIBDIVIDE_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \ + LIBDIVIDE_FUNCTION, #x); \ + abort(); \ + } \ + } while (0) +#else +#define LIBDIVIDE_ASSERT(x) #endif #ifdef __cplusplus @@ -131,6 +119,16 @@ namespace libdivide { // by up to 10% because of reduced memory bandwidth. 
#pragma pack(push, 1) +struct libdivide_u16_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_t { + int16_t magic; + uint8_t more; +}; + struct libdivide_u32_t { uint32_t magic; uint8_t more; @@ -151,6 +149,16 @@ struct libdivide_s64_t { uint8_t more; }; +struct libdivide_u16_branchfree_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_branchfree_t { + int16_t magic; + uint8_t more; +}; + struct libdivide_u32_branchfree_t { uint32_t magic; uint8_t more; @@ -206,60 +214,105 @@ struct libdivide_s64_branchfree_t { // whether the divisor is negated. In branchfree strategy, it is not negated. enum { + LIBDIVIDE_16_SHIFT_MASK = 0x1F, LIBDIVIDE_32_SHIFT_MASK = 0x1F, LIBDIVIDE_64_SHIFT_MASK = 0x3F, LIBDIVIDE_ADD_MARKER = 0x40, LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 }; -//static inline struct libdivide_s32_t libdivide_s32_gen(int32_t d); -static inline struct libdivide_u32_t libdivide_u32_gen(uint32_t d); -//static inline struct libdivide_s64_t libdivide_s64_gen(int64_t d); -//static inline struct libdivide_u64_t libdivide_u64_gen(uint64_t d); +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d); -/*static inline struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); -static inline struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); -static inline struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); -static inline struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);*/ +static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t 
libdivide_s16_branchfree_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); -//static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); -static inline uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom); -//static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); -//static inline uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do( + int16_t numer, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( + uint16_t numer, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_do( + int32_t numer, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( + uint32_t numer, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_do( + int64_t numer, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( + uint64_t numer, const struct libdivide_u64_t *denom); -/*static inline int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom); -static inline uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t 
*denom); -static inline int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom);*/ +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( + int16_t numer, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( + int32_t numer, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do( + int64_t numer, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom); -/*static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); -static inline uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); -static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); -static inline uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);*/ +static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); -/*static inline int32_t libdivide_s32_branchfree_recover(const struct 
libdivide_s32_branchfree_t *denom); -static inline uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom); -static inline int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom);*/ +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( + const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( + const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( + const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( + const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover( + const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover( + const struct libdivide_u64_branchfree_t *denom); //////// Internal Utility Functions -static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { +static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) { + uint32_t xl = x, yl = y; + uint32_t rl = xl * yl; + return (uint16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) { + int32_t xl = x, yl = y; + int32_t rl = xl * yl; + // needs to be arithmetic shift + return (int16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { uint64_t xl = x, yl = y; uint64_t rl = xl * yl; return (uint32_t)(rl >> 32); } -static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { +static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { int64_t xl = x, yl = y; int64_t rl = xl * yl; // needs to be arithmetic shift return (int32_t)(rl >> 32); } -static inline uint64_t 
libdivide_mullhi_u64(uint64_t x, uint64_t y) { -#if defined(LIBDIVIDE_VC) && \ - defined(LIBDIVIDE_X86_64) +static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __umulh(x, y); #elif defined(HAS_INT128_T) __uint128_t xl = x, yl = y; @@ -284,9 +337,8 @@ static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { #endif } -static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { -#if defined(LIBDIVIDE_VC) && \ - defined(LIBDIVIDE_X86_64) +static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __mulh(x, y); #elif defined(HAS_INT128_T) __int128_t xl = x, yl = y; @@ -307,9 +359,41 @@ static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { #endif } -static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { -#if defined(__GNUC__) || \ - __has_builtin(__builtin_clz) +static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + // On the AVR 8-bit architecture __builtin_clz() works on a int16_t. 
+ return __builtin_clz(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) + // Fast way to count leading zeros + return __builtin_clz(val) - 16; +#elif defined(LIBDIVIDE_VC) + unsigned long result; + if (_BitScanReverse(&result, (unsigned long)val)) { + return (int16_t)(15 - result); + } + return 0; +#else + if (val == 0) return 16; + int16_t result = 4; + uint16_t hi = 0xFU << 12; + while ((val & hi) == 0) { + hi >>= 4; + result += 4; + } + while (val & hi) { + result -= 1; + hi <<= 1; + } + return result; +#endif +} + +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + return __builtin_clzl(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) // Fast way to count leading zeros return __builtin_clz(val); #elif defined(LIBDIVIDE_VC) @@ -319,8 +403,7 @@ static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { } return 0; #else - if (val == 0) - return 32; + if (val == 0) return 32; int32_t result = 8; uint32_t hi = 0xFFU << 24; while ((val & hi) == 0) { @@ -335,9 +418,8 @@ static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { #endif } -static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { -#if defined(__GNUC__) || \ - __has_builtin(__builtin_clzll) +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) // Fast way to count leading zeros return __builtin_clzll(val); #elif defined(LIBDIVIDE_VC) && defined(_WIN64) @@ -354,17 +436,25 @@ static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { #endif } +// libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit +// uint {v}. The result must fit in 16 bits. 
+// Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( + uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) { + uint32_t n = ((uint32_t)u1 << 16) | u0; + uint16_t result = (uint16_t)(n / v); + *r = (uint16_t)(n - result * (uint32_t)v); + return result; +} + // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit // uint {v}. The result must fit in 32 bits. // Returns the quotient directly and the remainder in *r -static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { -#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) +static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32( + uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { +#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint32_t result; - __asm__("divl %[v]" - : "=a"(result), "=d"(*r) - : [v] "r"(v), "a"(u0), "d"(u1) - ); + __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1)); return result; #else uint64_t n = ((uint64_t)u1 << 32) | u0; @@ -374,108 +464,115 @@ static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint3 #endif } -// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit -// uint {v}. The result must fit in 64 bits. -// Returns the quotient directly and the remainder in *r -/*static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { -#if defined(LIBDIVIDE_X86_64) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) +// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The +// result must fit in 64 bits. Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64( + uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) { + // N.B. resist the temptation to use __uint128_t here. 
+ // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than + // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because + // it's not LIBDIVIDE_INLINEd. +#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint64_t result; - __asm__("divq %[v]" - : "=a"(result), "=d"(*r) - : [v] "r"(v), "a"(u0), "d"(u1) - ); - return result; -#elif defined(HAS_INT128_T) && \ - defined(HAS_INT128_DIV) - __uint128_t n = ((__uint128_t)u1 << 64) | u0; - uint64_t result = (uint64_t)(n / v); - *r = (uint64_t)(n - result * (__uint128_t)v); + __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi)); return result; #else - // Code taken from Hacker's Delight: - // http://www.hackersdelight.org/HDcode/divlu.c. - // License permits inclusion here per: - // http://www.hackersdelight.org/permissions.htm + // We work in base 2**32. + // A uint32 holds a single digit. A uint64 holds two digits. + // Our numerator is conceptually [num3, num2, num1, num0]. + // Our denominator is [den1, den0]. + const uint64_t b = ((uint64_t)1 << 32); - const uint64_t b = (1ULL << 32); // Number base (32 bits) - uint64_t un1, un0; // Norm. dividend LSD's - uint64_t vn1, vn0; // Norm. divisor digits - uint64_t q1, q0; // Quotient digits - uint64_t un64, un21, un10; // Dividend digit pairs - uint64_t rhat; // A remainder - int32_t s; // Shift amount for norm + // The high and low digits of our computed quotient. + uint32_t q1; + uint32_t q0; - // If overflow, set rem. to an impossible value, - // and return the largest possible quotient - if (u1 >= v) { - *r = (uint64_t) -1; - return (uint64_t) -1; + // The normalization shift factor. + int shift; + + // The high and low digits of our denominator (after normalizing). + // Also the low 2 digits of our numerator (after normalizing). + uint32_t den1; + uint32_t den0; + uint32_t num1; + uint32_t num0; + + // A partial remainder. 
+ uint64_t rem; + + // The estimated quotient, and its corresponding remainder (unrelated to true remainder). + uint64_t qhat; + uint64_t rhat; + + // Variables used to correct the estimated quotient. + uint64_t c1; + uint64_t c2; + + // Check for overflow and divide by 0. + if (numhi >= den) { + if (r != NULL) *r = ~0ull; + return ~0ull; } - // count leading zeros - s = libdivide_count_leading_zeros64(v); - if (s > 0) { - // Normalize divisor - v = v << s; - un64 = (u1 << s) | (u0 >> (64 - s)); - un10 = u0 << s; // Shift dividend left - } else { - // Avoid undefined behavior of (u0 >> 64). - // The behavior is undefined if the right operand is - // negative, or greater than or equal to the length - // in bits of the promoted left operand. - un64 = u1; - un10 = u0; - } + // Determine the normalization factor. We multiply den by this, so that its leading digit is at + // least half b. In binary this means just shifting left by the number of leading zeros, so that + // there's a 1 in the MSB. + // We also shift numer by the same amount. This cannot overflow because numhi < den. + // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting + // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is + // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it. + shift = libdivide_count_leading_zeros64(den); + den <<= shift; + numhi <<= shift; + numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63); + numlo <<= shift; - // Break divisor up into two 32-bit digits - vn1 = v >> 32; - vn0 = v & 0xFFFFFFFF; + // Extract the low digits of the numerator and both digits of the denominator. 
+ num1 = (uint32_t)(numlo >> 32); + num0 = (uint32_t)(numlo & 0xFFFFFFFFu); + den1 = (uint32_t)(den >> 32); + den0 = (uint32_t)(den & 0xFFFFFFFFu); - // Break right half of dividend into two digits - un1 = un10 >> 32; - un0 = un10 & 0xFFFFFFFF; + // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. + // Estimate q1 as [n3 n2] / [d1], and then correct it. + // Note while qhat may be 2 digits, q1 is always 1 digit. + qhat = numhi / den1; + rhat = numhi % den1; + c1 = qhat * den0; + c2 = rhat * b + num1; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q1 = (uint32_t)qhat; - // Compute the first quotient digit, q1 - q1 = un64 / vn1; - rhat = un64 - q1 * vn1; + // Compute the true (partial) remainder. + rem = numhi * b + num1 - q1 * den; - while (q1 >= b || q1 * vn0 > b * rhat + un1) { - q1 = q1 - 1; - rhat = rhat + vn1; - if (rhat >= b) - break; - } + // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. + // Estimate q0 as [rem1 rem0] / [d1] and correct it. + qhat = rem / den1; + rhat = rem % den1; + c1 = qhat * den0; + c2 = rhat * b + num0; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q0 = (uint32_t)qhat; - // Multiply and subtract - un21 = un64 * b + un1 - q1 * v; - - // Compute the second quotient digit - q0 = un21 / vn1; - rhat = un21 - q0 * vn1; - - while (q0 >= b || q0 * vn0 > b * rhat + un0) { - q0 = q0 - 1; - rhat = rhat + vn1; - if (rhat >= b) - break; - } - - *r = (un21 * b + un0 - q0 * v) >> s; - return q1 * b + q0; + // Return remainder if requested. 
+ if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift; + return ((uint64_t)q1 << 32) | q0; #endif -}*/ +} + +#if !(defined(HAS_INT128_T) && \ + defined(HAS_INT128_DIV)) // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) -static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) { +static LIBDIVIDE_INLINE void libdivide_u128_shift( + uint64_t *u1, uint64_t *u0, int32_t signed_shift) { if (signed_shift > 0) { uint32_t shift = signed_shift; *u1 <<= shift; *u1 |= *u0 >> (64 - shift); *u0 <<= shift; - } - else if (signed_shift < 0) { + } else if (signed_shift < 0) { uint32_t shift = -signed_shift; *u0 >>= shift; *u0 |= *u1 << (64 - shift); @@ -483,10 +580,12 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign } } +#endif + // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. -/*static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { -#if defined(HAS_INT128_T) && \ - defined(HAS_INT128_DIV) +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64( + uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { +#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV) __uint128_t ufull = u_hi; __uint128_t vfull = v_hi; ufull = (ufull << 64) | u_lo; @@ -499,7 +598,10 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign #else // Adapted from "Unsigned Doubleword Division" in Hacker's Delight // We want to compute u / v - typedef struct { uint64_t hi; uint64_t lo; } u128_t; + typedef struct { + uint64_t hi; + uint64_t lo; + } u128_t; u128_t u = {u_hi, u_lo}; u128_t v = {v_hi, v_lo}; @@ -519,7 +621,7 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign // Normalize the divisor so its MSB is 1 u128_t v1t = v; libdivide_u128_shift(&v1t.hi, &v1t.lo, n); - uint64_t v1 = v1t.hi; // i.e. 
v1 = v1t >> 64 + uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64 // To ensure no overflow u128_t u1 = u; @@ -537,7 +639,7 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign // Make q0 correct or too small by 1 // Equivalent to `if (q0 != 0) q0 = q0 - 1;` if (q0.hi != 0 || q0.lo != 0) { - q0.hi -= (q0.lo == 0); // borrow + q0.hi -= (q0.lo == 0); // borrow q0.lo -= 1; } @@ -549,22 +651,21 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign // Each term is 128 bit // High half of full product (upper 128 bits!) are dropped u128_t q0v = {0, 0}; - q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide_mullhi_u64(q0.lo, v.lo); - q0v.lo = q0.lo*v.lo; + q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo); + q0v.lo = q0.lo * v.lo; // Compute u - q0v as u_q0v // This is the remainder u128_t u_q0v = u; - u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow + u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow u_q0v.lo -= q0v.lo; // Check if u_q0v >= v // This checks if our remainder is larger than the divisor - if ((u_q0v.hi > v.hi) || - (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { + if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { // Increment q0 q0.lo += 1; - q0.hi += (q0.lo == 0); // carry + q0.hi += (q0.lo == 0); // carry // Subtract v from remainder u_q0v.hi -= v.hi + (u_q0v.lo < v.lo); @@ -577,19 +678,182 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign LIBDIVIDE_ASSERT(q0.hi == 0); return q0.lo; #endif -}*/ +} -////////// UINT32 - -static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) { - struct libdivide_u32_t result; - uint32_t floor_log_2_d; +////////// UINT16 +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen( + uint16_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } - floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); + struct 
libdivide_u16_t result; + uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d)); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } else { + uint8_t more; + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint16_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) { + // This power works + more = floor_log_2_d; + } else { + // We have to use the general 17-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases. 
+ } + return result; +} + +struct libdivide_u16_t libdivide_u16_gen(uint16_t d) { + return libdivide_internal_u16_gen(d, 0); +} + +struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1); + struct libdivide_u16_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)}; + return ret; +} + +// The original libdivide_u16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_u16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { + if (!magic) { + return numer >> more; + } else { + uint16_t q = libdivide_mullhi_u16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint16_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_16_SHIFT_MASK); + } else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) { + return libdivide_u16_do_raw(numer, denom->magic, denom->more); +} + +uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t *denom) { + uint16_t q = libdivide_mullhi_u16(denom->magic, numer); + uint16_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(16 + shift) + // Therefore we have d = 2^(16 + shift) / m + // We need to ceil it. 
+ // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint16_t hi_dividend = (uint16_t)1 << shift; + uint16_t rem_ignored; + return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ((uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ((uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! 
+ uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling th + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +////////// UINT32 + +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen( + uint32_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u32_t result; + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); // Power of 2 if ((d & (d - 1)) == 0) { @@ -602,26 +866,25 @@ static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int } else { uint8_t more; uint32_t rem, proposed_m; - uint32_t e; - proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem); + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); - e = d - rem; + const uint32_t e = d - rem; // This power works if e < 2**floor_log_2_d. - if (!branchfree && (e < (1U << floor_log_2_d))) { + if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) { // This power works - more = floor_log_2_d; + more = (uint8_t)floor_log_2_d; } else { // We have to use the general 33-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and // its remainder. By doubling both, and then correcting the // remainder, we can compute the larger division. 
// don't care about overflow here - in fact, we expect it - const uint32_t twice_rem = rem + rem; proposed_m += proposed_m; + const uint32_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; @@ -638,27 +901,26 @@ struct libdivide_u32_t libdivide_u32_gen(uint32_t d) { return libdivide_internal_u32_gen(d, 0); } -/*struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { +struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { if (d == 1) { LIBDIVIDE_ERROR("branchfree divider must be != 1"); } struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1); - struct libdivide_u32_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; + struct libdivide_u32_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; return ret; -}*/ +} uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return numer >> more; - } - else { + } else { uint32_t q = libdivide_mullhi_u32(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint32_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_32_SHIFT_MASK); - } - else { + } else { // All upper bits are 0, // don't need to mask them off. 
return q >> more; @@ -666,7 +928,8 @@ uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { } } -/*uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { +uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { uint32_t q = libdivide_mullhi_u32(denom->magic, numer); uint32_t t = ((numer - q) >> 1) + q; return t >> denom->more; @@ -677,14 +940,14 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - return 1U << shift; + return (uint32_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(32 + shift) // Therefore we have d = 2^(32 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor - uint32_t hi_dividend = 1U << shift; + uint32_t hi_dividend = (uint32_t)1 << shift; uint32_t rem_ignored; return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored); } else { @@ -693,8 +956,8 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. - uint64_t half_n = 1ULL << (32 + shift); - uint64_t d = (1ULL << 32) | denom->magic; + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! uint32_t half_q = (uint32_t)(half_n / d); @@ -703,7 +966,7 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. 
// Note that rem<<1 cannot overflow, since rem < d and d is 33 bits - uint32_t full_q = half_q + half_q + ((rem<<1) >= d); + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; @@ -715,15 +978,15 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - return 1U << (shift + 1); + return (uint32_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. - uint64_t half_n = 1ULL << (32 + shift); - uint64_t d = (1ULL << 32) | denom->magic; + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! uint32_t half_q = (uint32_t)(half_n / d); @@ -732,16 +995,17 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. 
// Note that rem<<1 cannot overflow, since rem < d and d is 33 bits - uint32_t full_q = half_q + half_q + ((rem<<1) >= d); + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; } -}*/ +} /////////// UINT64 -/*static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen( + uint64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -761,15 +1025,15 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ uint64_t proposed_m, rem; uint8_t more; // (1 << (64 + floor_log_2_d)) / d - proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint64_t e = d - rem; // This power works if e < 2**floor_log_2_d. - if (!branchfree && e < (1ULL << floor_log_2_d)) { + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d; + more = (uint8_t)floor_log_2_d; } else { // We have to use the general 65-bit algorithm. We need to compute // (2**power) / d. 
However, we already have (2**(power-1))/d and @@ -779,7 +1043,7 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ proposed_m += proposed_m; const uint64_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; @@ -802,7 +1066,8 @@ struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { LIBDIVIDE_ERROR("branchfree divider must be != 1"); } struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1); - struct libdivide_u64_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; + struct libdivide_u64_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; return ret; } @@ -810,22 +1075,21 @@ uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return numer >> more; - } - else { + } else { uint64_t q = libdivide_mullhi_u64(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint64_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_64_SHIFT_MASK); - } - else { - // All upper bits are 0, - // don't need to mask them off. + } else { + // All upper bits are 0, + // don't need to mask them off. 
return q >> more; } } } -uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { +uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t t = ((numer - q) >> 1) + q; return t >> denom->more; @@ -836,14 +1100,14 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { - return 1ULL << shift; + return (uint64_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(64 + shift) // Therefore we have d = 2^(64 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor - uint64_t hi_dividend = 1ULL << shift; + uint64_t hi_dividend = (uint64_t)1 << shift; uint64_t rem_ignored; return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); } else { @@ -855,19 +1119,20 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. - uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! 
uint64_t r_hi, r_lo; - uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; - uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); return full_q + 1; @@ -879,7 +1144,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { - return 1ULL << (shift + 1); + return (uint64_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). // Notice (m + 2^64) is a 65 bit number. This gets hairy. See @@ -889,28 +1154,205 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. - uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! 
uint64_t r_hi, r_lo; - uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; - uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); return full_q + 1; } -}*/ +} + +/////////// SINT16 + +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen( + int16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s16_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint16_t ud = (uint16_t)d; + uint16_t absD = (d < 0) ? -ud : ud; + uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and normal paths are exactly the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + // the dividend here is 2**(floor_log_2_d + 31), so the low 16 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint16_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int16_t. + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + + proposed_m += 1; + int16_t magic = (int16_t)proposed_m; + + // Mark if we are negative. Note we only negate the magic number in the + // branchfull case. + if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s16_t libdivide_s16_gen(int16_t d) { + return libdivide_internal_s16_gen(d, 0); +} + +struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) { + struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1); + struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more}; + return result; +} + +// The original libdivide_s16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_s16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. 
+int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) { + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!magic) { + uint16_t sign = (int8_t)more >> 7; + uint16_t mask = ((uint16_t)1 << shift) - 1; + uint16_t uq = numer + ((numer >> 15) & mask); + int16_t q = (int16_t)uq; + q >>= shift; + q = (q ^ sign) - sign; + return q; + } else { + uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + // q += (more < 0 ? -numer : numer) + // cast required to avoid UB + uq += ((uint16_t)numer ^ sign) - sign; + } + int16_t q = (int16_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) { + return libdivide_s16_do_raw(numer, denom->magic, denom->more); +} + +int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + int16_t magic = denom->magic; + int16_t q = libdivide_mullhi_s16(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + uint16_t q_sign = (uint16_t)(q >> 15); + q += q_sign & (((uint16_t)1 << shift) - is_power_of_2); + + // Now arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + if (!denom->magic) { + uint16_t absD = (uint16_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int16_t)absD; + } else 
{ + // Unsigned math is much easier + // We negate the magic number only in the branchfull case, and we don't + // know which case we're in. However we have enough information to + // determine the correct sign of the magic number. The divisor was + // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, + // the magic number's sign is opposite that of the divisor. + // We want to compute the positive magic number. + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + // Handle the power of 2 case (including branchfree) + if (denom->magic == 0) { + int16_t result = (uint16_t)1 << shift; + return negative_divisor ? -result : result; + } + + uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic); + uint32_t n = (uint32_t)1 << (16 + shift); // this shift cannot exceed 30 + uint16_t q = (uint16_t)(n / d); + int16_t result = (int16_t)q; + result += 1; + return negative_divisor ? -result : result; + } +} + +int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) { + return libdivide_s16_recover((const struct libdivide_s16_t *)denom); +} /////////// SINT32 -/*static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen( + int32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -931,7 +1373,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ if ((absD & (absD - 1)) == 0) { // Branchfree and normal paths are exactly the same result.magic = 0; - result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0)); } else { LIBDIVIDE_ASSERT(floor_log_2_d >= 1); @@ -939,14 +1381,14 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word // is 0 and the high word is floor_log_2_d - 1 uint32_t rem, proposed_m; - proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem); + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem); const uint32_t e = absD - rem; // We are going to start with a power of floor_log_2_d - 1. // This works if works if e < 2**floor_log_2_d. - if (!branchfree && e < (1U << floor_log_2_d)) { + if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d - 1; + more = (uint8_t)(floor_log_2_d - 1); } else { // We need to go one higher. This should not make proposed_m // overflow, but it will make it negative when interpreted as an @@ -954,7 +1396,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ proposed_m += proposed_m; const uint32_t twice_rem = rem + rem; if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } proposed_m += 1; @@ -991,7 +1433,7 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { if (!denom->magic) { uint32_t sign = (int8_t)more >> 7; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; uint32_t uq = numer + ((numer >> 31) & mask); int32_t q = (int32_t)uq; q >>= shift; @@ -1027,7 +1469,7 @@ int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_br // 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); uint32_t q_sign = (uint32_t)(q >> 31); - q += q_sign & ((1U << shift) - is_power_of_2); + q += q_sign & (((uint32_t)1 << shift) - is_power_of_2); // Now 
arithmetic right shift q >>= shift; @@ -1041,7 +1483,7 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - uint32_t absD = 1U << shift; + uint32_t absD = (uint32_t)1 << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } @@ -1055,17 +1497,16 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { // the magic number's sign is opposite that of the divisor. // We want to compute the positive magic number. int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); - int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) - ? denom->magic > 0 : denom->magic < 0; + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; // Handle the power of 2 case (including branchfree) if (denom->magic == 0) { - int32_t result = 1U << shift; + int32_t result = (uint32_t)1 << shift; return negative_divisor ? -result : result; } uint32_t d = (uint32_t)(magic_was_negated ? 
-denom->magic : denom->magic); - uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30 + uint64_t n = (uint64_t)1 << (32 + shift); // this shift cannot exceed 30 uint32_t q = (uint32_t)(n / d); int32_t result = (int32_t)q; result += 1; @@ -1075,11 +1516,12 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) { return libdivide_s32_recover((const struct libdivide_s32_t *)denom); -}*/ +} ///////////// SINT64 -/*static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen( + int64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -1100,20 +1542,20 @@ int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t if ((absD & (absD - 1)) == 0) { // Branchfree and non-branchfree cases are the same result.magic = 0; - result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); } else { // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word // is 0 and the high word is floor_log_2_d - 1 uint8_t more; uint64_t rem, proposed_m; - proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem); + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem); const uint64_t e = absD - rem; // We are going to start with a power of floor_log_2_d - 1. // This works if works if e < 2**floor_log_2_d. - if (!branchfree && e < (1ULL << floor_log_2_d)) { + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d - 1; + more = (uint8_t)(floor_log_2_d - 1); } else { // We need to go one higher. 
This should not make proposed_m // overflow, but it will make it negative when interpreted as an @@ -1125,7 +1567,7 @@ int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t // also set ADD_MARKER this is an annoying optimization that // enables algorithm #4 to avoid the mask. However we always set it // in the branchfree case - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } proposed_m += 1; int64_t magic = (int64_t)proposed_m; @@ -1158,8 +1600,8 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - if (!denom->magic) { // shift path - uint64_t mask = (1ULL << shift) - 1; + if (!denom->magic) { // shift path + uint64_t mask = ((uint64_t)1 << shift) - 1; uint64_t uq = numer + ((numer >> 63) & mask); int64_t q = (int64_t)uq; q >>= shift; @@ -1197,7 +1639,7 @@ int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_br // 2, or (2**shift) if it is not a power of 2. 
uint64_t is_power_of_2 = (magic == 0); uint64_t q_sign = (uint64_t)(q >> 63); - q += q_sign & ((1ULL << shift) - is_power_of_2); + q += q_sign & (((uint64_t)1 << shift) - is_power_of_2); // Arithmetic right shift q >>= shift; @@ -1210,8 +1652,8 @@ int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_br int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - if (denom->magic == 0) { // shift path - uint64_t absD = 1ULL << shift; + if (denom->magic == 0) { // shift path + uint64_t absD = (uint64_t)1 << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } @@ -1219,11 +1661,10 @@ int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { } else { // Unsigned math is much easier int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); - int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) - ? denom->magic > 0 : denom->magic < 0; + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic); - uint64_t n_hi = 1ULL << shift, n_lo = 0; + uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0; uint64_t rem_ignored; uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored); int64_t result = (int64_t)(q + 1); @@ -1236,32 +1677,364 @@ int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) { return libdivide_s64_recover((const struct libdivide_s64_t *)denom); -}*/ +} -#if defined(LIBDIVIDE_AVX512) +// Simplest possible vector type division: treat the vector type as an array +// of underlying native type. +// +// Use a union to read a vector via pointer-to-integer, without violating strict +// aliasing. 
+#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \ + const size_t count = sizeof(VecT) / sizeof(IntT); \ + union type_pun_vec { \ + VecT vec; \ + IntT arr[sizeof(VecT) / sizeof(IntT)]; \ + }; \ + union type_pun_vec result; \ + union type_pun_vec input; \ + input.vec = numers; \ + for (size_t loop = 0; loop < count; ++loop) { \ + result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \ + } \ + return result.vec; -static inline __m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom); -static inline __m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom); -static inline __m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom); -static inline __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom); +#if defined(LIBDIVIDE_NEON) -static inline __m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128( + uint16x8_t numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128( + int16x8_t numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128( + uint32x4_t numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128( + int32x4_t numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128( + uint64x2_t numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE int64x2_t 
libdivide_s64_do_vec128( + int64x2_t numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128( + uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128( + int16x8_t numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128( + uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128( + int32x4_t numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128( + uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128( + int64x2_t numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions -static inline __m512i libdivide_s64_signbits(__m512i v) {; +// Logical right shift by runtime value. +// NEON implements right shift as left shits by negative values. +static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) { + int32_t wamt = (int32_t)(amt); + return vshlq_u32(v, vdupq_n_s32(-wamt)); +} + +static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) { + int64_t wamt = (int64_t)(amt); + return vshlq_u64(v, vdupq_n_s64(-wamt)); +} + +// Arithmetic right shift by runtime value. 
+static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) { + int32_t wamt = (int32_t)(amt); + return vshlq_s32(v, vdupq_n_s32(-wamt)); +} + +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) { + int64_t wamt = (int64_t)(amt); + return vshlq_s64(v, vdupq_n_s64(-wamt)); +} + +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); } + +static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) { + // Desire is [x0, x1, x2, x3] + uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b)); // [_, x0, _, x1] + uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b)); //[_, x2, _, x3] + return vuzp2q_u32(w1, w2); // [x0, x1, x2, x3] +} + +static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) { + int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b)); // [_, x0, _, x1] + int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b)); //[_, x2, _, x3] + return vuzp2q_s32(w1, w2); // [x0, x1, x2, x3] +} + +static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) { + // full 128 bits product is: + // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) + // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. + + // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits. + uint64x2_t y = vdupq_n_u64(sy); + uint32x2_t x0 = vmovn_u64(x); + uint32x2_t y0 = vmovn_u64(y); + uint32x2_t x1 = vshrn_n_u64(x, 32); + uint32x2_t y1 = vshrn_n_u64(y, 32); + + // Compute x0*y0. + uint64x2_t x0y0 = vmull_u32(x0, y0); + uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32); + + // Compute other intermediate products. + uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0); // temp = x0y0_hi + x1*y0; + // We want to split temp into its low 32 bits and high 32 bits, both + // in the low half of 64 bit registers. 
+ // Use shifts to avoid needing a reg for the mask. + uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32); // temp_lo = temp & 0xFFFFFFFF; + uint64x2_t temp_hi = vshrq_n_u64(temp, 32); // temp_hi = temp >> 32; + + temp_lo = vmlal_u32(temp_lo, x0, y1); // temp_lo += x0*y0 + temp_lo = vshrq_n_u64(temp_lo, 32); // temp_lo >>= 32 + temp_hi = vmlal_u32(temp_hi, x1, y1); // temp_hi += x1*y1 + uint64x2_t result = vaddq_u64(temp_hi, temp_lo); + return result; +} + +static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { + int64x2_t p = vreinterpretq_s64_u64( + libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy))); + int64x2_t y = vdupq_n_s64(sy); + int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y); + int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x); + p = vsubq_s64(p, t1); + p = vsubq_s64(p, t2); + return p; +} + +////////// UINT16 + +uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){ + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)} + +uint16x8_t libdivide_u16_branchfree_do_vec128( + uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){ + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)} + +////////// UINT32 + +uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u32_neon_srl(numers, more); + } else { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // Note we can use halving-subtract to avoid the shift. 
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, shift); + } else { + return libdivide_u32_neon_srl(q, more); + } + } +} + +uint32x4_t libdivide_u32_branchfree_do_vec128( + uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, denom->more); +} + +////////// UINT64 + +uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u64_neon_srl(numers, more); + } else { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // No 64-bit halving subtracts in NEON :( + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, shift); + } else { + return libdivide_u64_neon_srl(q, more); + } + } +} + +uint64x2_t libdivide_u64_branchfree_do_vec128( + uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, denom->more); +} + +////////// SINT16 + +int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){ + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)} + +int16x8_t libdivide_s16_branchfree_do_vec128( + int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){ + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)} + +////////// SINT32 + +int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if 
(!denom->magic) { + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = ((uint32_t)1 << shift) - 1; + int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak)); + q = libdivide_s32_neon_sra(q, shift); + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = vsubq_s32(veorq_s32(q, sign), sign); + return q; + } else { + int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign)); + } + // q >>= shift + q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = vaddq_s32( + q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31))); // q += (q < 0) + return q; + } +} + +int32x4_t libdivide_s32_branchfree_do_vec128( + int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic); + q = vaddq_s32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31 + int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2); + q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s32_neon_sra(q, shift); // q >>= shift + q = vsubq_s32(veorq_s32(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + +int64x2_t 
libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = ((uint64_t)1 << shift) - 1; + int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend + // q = numer + ((numer >> 63) & roundToZeroTweak); + int64x2_t q = + vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_neon_sra(q, shift); + // q = (q ^ sign) - sign; + int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7)); + q = vsubq_s64(veorq_s64(q, sign), sign); + return q; + } else { + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: no need to widen + // q += ((numer ^ sign) - sign); + q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = vaddq_s64( + q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63))); // q += (q < 0) + return q; + } +} + +int64x2_t libdivide_s64_branchfree_do_vec128( + int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: avoid sign extend + + // libdivide_mullhi_s64(numers, magic); + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + q = vaddq_s64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
+ uint32_t is_power_of_2 = (magic == 0); + int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2); + q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_neon_sra(q, shift); // q >>= shift + q = vsubq_s64(veorq_s64(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +#if defined(LIBDIVIDE_AVX512) + +static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512( + __m512i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512( + __m512i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512( + __m512i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512( + __m512i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512( + __m512i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512( + __m512i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512( + __m512i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512( + __m512i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +static 
LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) { + ; return _mm512_srai_epi64(v, 63); } -static inline __m512i libdivide_s64_shift_right_vector(__m512i v, int amt) { +static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { return _mm512_srai_epi64(v, amt); } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1270,7 +2043,7 @@ static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) { } // b is one 32-bit value repeated. -static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1279,164 +2052,182 @@ static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) { } // Here, y is assumed to contain one 64-bit value repeated. 
-// https://stackoverflow.com/a/28827013 -static inline __m512i libdivide_mullhi_u64_vector(__m512i x, __m512i y) { - __m512i lomask = _mm512_set1_epi64(0xffffffff); - __m512i xh = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM) 0xB1); - __m512i yh = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM) 0xB1); - __m512i w0 = _mm512_mul_epu32(x, y); - __m512i w1 = _mm512_mul_epu32(x, yh); - __m512i w2 = _mm512_mul_epu32(xh, y); - __m512i w3 = _mm512_mul_epu32(xh, yh); - __m512i w0h = _mm512_srli_epi64(w0, 32); - __m512i s1 = _mm512_add_epi64(w1, w0h); - __m512i s1l = _mm512_and_si512(s1, lomask); - __m512i s1h = _mm512_srli_epi64(s1, 32); - __m512i s2 = _mm512_add_epi64(w2, s1l); - __m512i s2h = _mm512_srli_epi64(s2, 32); - __m512i hi = _mm512_add_epi64(w3, s1h); - hi = _mm512_add_epi64(hi, s2h); +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { + // see m128i variant for comments. + __m512i x0y0 = _mm512_mul_epu32(x, y); + __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32); - return hi; + __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + + __m512i x0y1 = _mm512_mul_epu32(x, y1); + __m512i x1y0 = _mm512_mul_epu32(x1, y); + __m512i x1y1 = _mm512_mul_epu32(x1, y1); + + __m512i mask = _mm512_set1_epi64(0xFFFFFFFF); + __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi); + __m512i temp_lo = _mm512_and_si512(temp, mask); + __m512i temp_hi = _mm512_srli_epi64(temp, 32); + + temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm512_add_epi64(x1y1, temp_hi); + return _mm512_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. 
-static inline __m512i libdivide_mullhi_s64_vector(__m512i x, __m512i y) { - __m512i p = libdivide_mullhi_u64_vector(x, y); - __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y); - __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x); +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { + __m512i p = libdivide_mullhi_u64_vec512(x, y); + __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y); + __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x); p = _mm512_sub_epi64(p, t1); p = _mm512_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){ + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)} + +__m512i libdivide_u16_branchfree_do_vec512( + __m512i numers, const struct libdivide_u16_branchfree_t *denom){ + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)} + ////////// UINT32 -__m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom) { +__m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm512_srli_epi32(numers, more); - } - else { - __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); + } else { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); return _mm512_srli_epi32(t, shift); - } - else { + } else { return _mm512_srli_epi32(q, more); } } } -__m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom) { - __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); +__m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, 
const struct libdivide_u32_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); return _mm512_srli_epi32(t, denom->more); } ////////// UINT64 -__m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom) { +__m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm512_srli_epi64(numers, more); - } - else { - __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); + } else { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); return _mm512_srli_epi64(t, shift); - } - else { + } else { return _mm512_srli_epi64(q, more); } } } -__m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom) { - __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); +__m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); return _mm512_srli_epi64(t, denom->more); } +////////// SINT16 + +__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){ + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)} + +__m512i libdivide_s16_branchfree_do_vec512( + __m512i numers, const struct libdivide_s16_branchfree_t *denom){ + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)} + ////////// SINT32 -__m512i libdivide_s32_do_vector(__m512i numers, const 
struct libdivide_s32_t *denom) { +__m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); - __m512i q = _mm512_add_epi32(numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); + __m512i q = _mm512_add_epi32( + numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm512_srai_epi32(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); return q; - } - else { - __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(denom->magic)); + } else { + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { - // must be arithmetic shift + // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - // q += ((numer ^ sign) - sign); + // q += ((numer ^ sign) - sign); q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign)); } // q >>= shift q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); - q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) + q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) return q; } } -__m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom) { +__m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift + // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - __m512i q = 
libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(magic)); - q = _mm512_add_epi32(q, numers); // q += numers + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic)); + q = _mm512_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); - __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 - __m512i mask = _mm512_set1_epi32((1U << shift) - is_power_of_2); - q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm512_srai_epi32(q, shift); // q >>= shift - q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 + __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm512_srai_epi32(q, shift); // q >>= shift + q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 -__m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom) { +__m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; - if (magic == 0) { // shift path + if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi64(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m512i q = _mm512_add_epi64(numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shift); + __m512i q = _mm512_add_epi64( + numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), 
roundToZeroTweak)); + q = libdivide_s64_shift_right_vec512(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - // q = (q ^ sign) - sign; + // q = (q ^ sign) - sign; q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); return q; - } - else { - __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); + } else { + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); @@ -1444,67 +2235,86 @@ __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *de q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign)); } // q >>= denom->mult_path.shift - q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); - q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) + q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) return q; } } -__m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom) { +__m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - // libdivide_mullhi_s64(numers, magic); - __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); - q = _mm512_add_epi64(q, numers); // q += numers + // libdivide_mullhi_s64(numers, magic); + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); + q = _mm512_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); - __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m512i mask = _mm512_set1_epi64((1ULL << shift) - is_power_of_2); - q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) - q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift - q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + __m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63 + __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2); + q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift + q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign return q; } -#elif defined(LIBDIVIDE_AVX2) +#endif -static inline __m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom); -static inline __m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom); -static inline __m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom); -static inline __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom); +#if defined(LIBDIVIDE_AVX2) -static inline __m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256( + __m256i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256( + __m256i numers, const struct libdivide_s16_t *denom); 
+static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256( + __m256i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256( + __m256i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256( + __m256i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256( + __m256i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256( + __m256i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256( + __m256i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm256_srai_epi64(v, 63) (from AVX512). -static inline __m256i libdivide_s64_signbits(__m256i v) { +static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) { __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm256_srai_epi64 (from AVX512). 
-static inline __m256i libdivide_s64_shift_right_vector(__m256i v, int amt) { +static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { const int b = 64 - amt; - __m256i m = _mm256_set1_epi64x(1ULL << (b - 1)); + __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1)); __m256i x = _mm256_srli_epi64(v, amt); __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1513,7 +2323,7 @@ static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) { } // b is one 32-bit value repeated. -static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1522,164 +2332,241 @@ static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) { } // Here, y is assumed to contain one 64-bit value repeated. 
-// https://stackoverflow.com/a/28827013 -static inline __m256i libdivide_mullhi_u64_vector(__m256i x, __m256i y) { - __m256i lomask = _mm256_set1_epi64x(0xffffffff); - __m256i xh = _mm256_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h - __m256i yh = _mm256_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h - __m256i w0 = _mm256_mul_epu32(x, y); // x0l*y0l, x1l*y1l - __m256i w1 = _mm256_mul_epu32(x, yh); // x0l*y0h, x1l*y1h - __m256i w2 = _mm256_mul_epu32(xh, y); // x0h*y0l, x1h*y0l - __m256i w3 = _mm256_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h - __m256i w0h = _mm256_srli_epi64(w0, 32); - __m256i s1 = _mm256_add_epi64(w1, w0h); - __m256i s1l = _mm256_and_si256(s1, lomask); - __m256i s1h = _mm256_srli_epi64(s1, 32); - __m256i s2 = _mm256_add_epi64(w2, s1l); - __m256i s2h = _mm256_srli_epi64(s2, 32); - __m256i hi = _mm256_add_epi64(w3, s1h); - hi = _mm256_add_epi64(hi, s2h); +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { + // see m128i variant for comments. + __m256i x0y0 = _mm256_mul_epu32(x, y); + __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32); - return hi; + __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + __m256i x0y1 = _mm256_mul_epu32(x, y1); + __m256i x1y0 = _mm256_mul_epu32(x1, y); + __m256i x1y1 = _mm256_mul_epu32(x1, y1); + + __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF); + __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi); + __m256i temp_lo = _mm256_and_si256(temp, mask); + __m256i temp_hi = _mm256_srli_epi64(temp, 32); + + temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm256_add_epi64(x1y1, temp_hi); + return _mm256_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. 
-static inline __m256i libdivide_mullhi_s64_vector(__m256i x, __m256i y) { - __m256i p = libdivide_mullhi_u64_vector(x, y); - __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y); - __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x); +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { + __m256i p = libdivide_mullhi_u64_vec256(x, y); + __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y); + __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x); p = _mm256_sub_epi64(p, t1); p = _mm256_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm256_srli_epi16(numers, more); + } else { + __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); + return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); + } else { + return _mm256_srli_epi16(q, more); + } + } +} + +__m256i libdivide_u16_branchfree_do_vec256( + __m256i numers, const struct libdivide_u16_branchfree_t *denom) { + __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); + __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); + return _mm256_srli_epi16(t, denom->more); +} + ////////// UINT32 -__m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom) { +__m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi32(numers, more); - } - else { - __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); + } else { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) 
>> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, shift); - } - else { + } else { return _mm256_srli_epi32(q, more); } } } -__m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom) { - __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); +__m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, denom->more); } ////////// UINT64 -__m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom) { +__m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi64(numers, more); - } - else { - __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); + } else { + __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, shift); - } - else { + } else { return _mm256_srli_epi64(q, more); } } } -__m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom) { - __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); +__m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u64_vec256(numers, 
_mm256_set1_epi64x(denom->magic)); __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, denom->more); } +////////// SINT16 + +__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + uint16_t mask = ((uint16_t)1 << shift) - 1; + __m256i roundToZeroTweak = _mm256_set1_epi16(mask); + // q = numer + ((numer >> 15) & roundToZeroTweak); + __m256i q = _mm256_add_epi16( + numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak)); + q = _mm256_srai_epi16(q, shift); + __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); + return q; + } else { + __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign)); + } + // q >>= shift + q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); + q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0) + return q; + } +} + +__m256i libdivide_s16_branchfree_do_vec256( + __m256i numers, const struct libdivide_s16_branchfree_t *denom) { + int16_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift + __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); + __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic)); + q = _mm256_add_epi16(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + __m256i q_sign = 
_mm256_srai_epi16(q, 15); // q_sign = q >> 15 + __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm256_srai_epi16(q, shift); // q >>= shift + q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + ////////// SINT32 -__m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom) { +__m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); - __m256i q = _mm256_add_epi32(numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); + __m256i q = _mm256_add_epi32( + numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm256_srai_epi32(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); return q; - } - else { - __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(denom->magic)); + } else { + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { - // must be arithmetic shift + // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - // q += ((numer ^ sign) - sign); + // q += ((numer ^ sign) - sign); q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign)); } // q >>= shift q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); - q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) + q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) return q; } } -__m256i libdivide_s32_branchfree_do_vector(__m256i 
numers, const struct libdivide_s32_branchfree_t *denom) { +__m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift + // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(magic)); - q = _mm256_add_epi32(q, numers); // q += numers + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic)); + q = _mm256_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); - __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 - __m256i mask = _mm256_set1_epi32((1U << shift) - is_power_of_2); - q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm256_srai_epi32(q, shift); // q >>= shift - q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 + __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm256_srai_epi32(q, shift); // q >>= shift + q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 -__m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom) { +__m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; - if (magic == 0) { // shift path + if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = 
((uint64_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m256i q = _mm256_add_epi64(numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shift); + __m256i q = _mm256_add_epi64( + numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec256(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - // q = (q ^ sign) - sign; + // q = (q ^ sign) - sign; q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); return q; - } - else { - __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); + } else { + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); @@ -1687,67 +2574,86 @@ __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *de q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign)); } // q >>= denom->mult_path.shift - q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); - q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) + q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) return q; } } -__m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom) { +__m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - // libdivide_mullhi_s64(numers, magic); - __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); - q = 
_mm256_add_epi64(q, numers); // q += numers + // libdivide_mullhi_s64(numers, magic); + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); + q = _mm256_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. uint32_t is_power_of_2 = (magic == 0); - __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m256i mask = _mm256_set1_epi64x((1ULL << shift) - is_power_of_2); - q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) - q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift - q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63 + __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift + q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } -#elif defined(LIBDIVIDE_SSE2) +#endif -static inline __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); -static inline __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); -static inline __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); -static inline __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); +#if defined(LIBDIVIDE_SSE2) -static inline __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct 
libdivide_u64_branchfree_t *denom); -static inline __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128( + __m128i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128( + __m128i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128( + __m128i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128( + __m128i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128( + __m128i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128( + __m128i numers, const struct libdivide_s64_t *denom); + +static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128( + __m128i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128( + __m128i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm_srai_epi64(v, 63) (from AVX512). 
-static inline __m128i libdivide_s64_signbits(__m128i v) { +static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) { __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm_srai_epi64 (from AVX512). -static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { +static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { const int b = 64 - amt; - __m128i m = _mm_set1_epi64x(1ULL << (b - 1)); + __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1)); __m128i x = _mm_srli_epi64(v, amt); __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); __m128i a1X3X = _mm_srli_epi64(a, 32); __m128i mask = _mm_set_epi32(-1, 0, -1, 0); @@ -1758,8 +2664,8 @@ static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) { // SSE2 does not have a signed multiplication instruction, but we can convert // unsigned to signed pretty efficiently. Again, b is just a 32 bit value // repeated four times. -static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) { - __m128i p = libdivide_mullhi_u32_vector(a, b); +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { + __m128i p = libdivide_mullhi_u32_vec128(a, b); // t1 = (a >> 31) & y, arithmetic shift __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); @@ -1769,164 +2675,251 @@ static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) { } // Here, y is assumed to contain one 64-bit value repeated. 
-// https://stackoverflow.com/a/28827013 -static inline __m128i libdivide_mullhi_u64_vector(__m128i x, __m128i y) { - __m128i lomask = _mm_set1_epi64x(0xffffffff); - __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h - __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h - __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, x1l*y1l - __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h - __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y0l - __m128i w3 = _mm_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h - __m128i w0h = _mm_srli_epi64(w0, 32); - __m128i s1 = _mm_add_epi64(w1, w0h); - __m128i s1l = _mm_and_si128(s1, lomask); - __m128i s1h = _mm_srli_epi64(s1, 32); - __m128i s2 = _mm_add_epi64(w2, s1l); - __m128i s2h = _mm_srli_epi64(s2, 32); - __m128i hi = _mm_add_epi64(w3, s1h); - hi = _mm_add_epi64(hi, s2h); +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { + // full 128 bits product is: + // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) + // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. - return hi; + // Compute x0*y0. + // Note x1, y1 are ignored by mul_epu32. + __m128i x0y0 = _mm_mul_epu32(x, y); + __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32); + + // Get x1, y1 in the low bits. + // We could shuffle or right shift. Shuffles are preferred as they preserve + // the source register for the next computation. + __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + // No need to mask off top 32 bits for mul_epu32. + __m128i x0y1 = _mm_mul_epu32(x, y1); + __m128i x1y0 = _mm_mul_epu32(x1, y); + __m128i x1y1 = _mm_mul_epu32(x1, y1); + + // Mask here selects low bits only. 
+ __m128i mask = _mm_set1_epi64x(0xFFFFFFFF); + __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); + __m128i temp_lo = _mm_and_si128(temp, mask); + __m128i temp_hi = _mm_srli_epi64(temp, 32); + + temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm_add_epi64(x1y1, temp_hi); + return _mm_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. -static inline __m128i libdivide_mullhi_s64_vector(__m128i x, __m128i y) { - __m128i p = libdivide_mullhi_u64_vector(x, y); - __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); - __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_vec128(x, y); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x); p = _mm_sub_epi64(p, t1); p = _mm_sub_epi64(p, t2); return p; } +////////// UINT26 + +__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return _mm_srli_epi16(numers, more); + } else { + __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); + return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); + } else { + return _mm_srli_epi16(q, more); + } + } +} + +__m128i libdivide_u16_branchfree_do_vec128( + __m128i numers, const struct libdivide_u16_branchfree_t *denom) { + __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); + __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); + return _mm_srli_epi16(t, denom->more); +} + ////////// UINT32 -__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { +__m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if 
(!denom->magic) { return _mm_srli_epi32(numers, more); - } - else { - __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); + } else { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, shift); - } - else { + } else { return _mm_srli_epi32(q, more); } } } -__m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) { - __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); +__m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, denom->more); } ////////// UINT64 -__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) { +__m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm_srli_epi64(numers, more); - } - else { - __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); + } else { + __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, shift); - } - else { + } else { return _mm_srli_epi64(q, more); } } } -__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) { - __m128i q = 
libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); +__m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, denom->more); } +////////// SINT16 + +__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + uint16_t mask = ((uint16_t)1 << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi16(mask); + // q = numer + ((numer >> 15) & roundToZeroTweak); + __m128i q = + _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak)); + q = _mm_srai_epi16(q, shift); + __m128i sign = _mm_set1_epi16((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); + return q; + } else { + __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi16((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign)); + } + // q >>= shift + q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); + q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s16_branchfree_do_vec128( + __m128i numers, const struct libdivide_s16_branchfree_t *denom) { + int16_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi16((int8_t)more >> 7); + __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic)); + q = _mm_add_epi16(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either 
(2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + __m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15 + __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); + q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi16(q, shift); // q >>= shift + q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + ////////// SINT32 -__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) { +__m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); - __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + __m128i q = + _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm_srai_epi32(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); return q; - } - else { - __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(denom->magic)); + } else { + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { - // must be arithmetic shift + // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - // q += ((numer ^ sign) - sign); + // q += ((numer ^ sign) - sign); q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); } // q >>= shift q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) return q; } } 
-__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) { +__m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift + // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(magic)); - q = _mm_add_epi32(q, numers); // q += numers + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic)); + q = _mm_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 - __m128i mask = _mm_set1_epi32((1U << shift) - is_power_of_2); - q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm_srai_epi32(q, shift); // q >>= shift - q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 + __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); + q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi32(q, shift); // q >>= shift + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 -__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) { +__m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; - if (magic == 0) { // shift path + if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = 
((uint64_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shift); + __m128i q = _mm_add_epi64( + numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec128(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - // q = (q ^ sign) - sign; + // q = (q ^ sign) - sign; q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); return q; - } - else { - __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic)); + } else { + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); @@ -1934,32 +2927,33 @@ __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *de q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); } // q >>= denom->mult_path.shift - q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); - q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) return q; } } -__m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom) { +__m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - // libdivide_mullhi_s64(numers, magic); - __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic)); - q = _mm_add_epi64(q, numers); // q += numers + // 
libdivide_mullhi_s64(numers, magic); + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); + q = _mm_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m128i mask = _mm_set1_epi64x((1ULL << shift) - is_power_of_2); - q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) - q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift - q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63 + __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); + q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift + q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } @@ -1969,143 +2963,307 @@ __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivid #ifdef __cplusplus -// The C++ divider class is templated on both an integer type -// (like uint64_t) and an algorithm type. -// * BRANCHFULL is the default algorithm type. -// * BRANCHFREE is the branchfree algorithm type. -enum { - BRANCHFULL, - BRANCHFREE +enum Branching { + BRANCHFULL, // use branching algorithms + BRANCHFREE // use branchfree algorithms }; -#if defined(LIBDIVIDE_AVX512) - #define LIBDIVIDE_VECTOR_TYPE __m512i -#elif defined(LIBDIVIDE_AVX2) - #define LIBDIVIDE_VECTOR_TYPE __m256i -#elif defined(LIBDIVIDE_SSE2) - #define LIBDIVIDE_VECTOR_TYPE __m128i +namespace detail { +enum Signedness { + SIGNED, + UNSIGNED, +}; + +#if defined(LIBDIVIDE_NEON) +// Helper to deduce NEON vector type for integral type. 
+template +struct NeonVec {}; + +template <> +struct NeonVec<16, UNSIGNED> { + typedef uint16x8_t type; +}; + +template <> +struct NeonVec<16, SIGNED> { + typedef int16x8_t type; +}; + +template <> +struct NeonVec<32, UNSIGNED> { + typedef uint32x4_t type; +}; + +template <> +struct NeonVec<32, SIGNED> { + typedef int32x4_t type; +}; + +template <> +struct NeonVec<64, UNSIGNED> { + typedef uint64x2_t type; +}; + +template <> +struct NeonVec<64, SIGNED> { + typedef int64x2_t type; +}; + +template +struct NeonVecFor { + // See 'class divider' for an explanation of these template parameters. + typedef typename NeonVec> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type; +}; + +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ + LIBDIVIDE_INLINE typename NeonVecFor::type divide( \ + typename NeonVecFor::type n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) #endif -#if !defined(LIBDIVIDE_VECTOR_TYPE) - #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) +#if defined(LIBDIVIDE_SSE2) +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } #else - #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) \ - LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { \ - return libdivide_##ALGO##_do_vector(n, &denom); \ - } +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX2) +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { \ + return libdivide_##ALGO##_do_vec256(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX512) +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { \ + return libdivide_##ALGO##_do_vec512(n, &denom); \ + } +#else +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) #endif // The DISPATCHER_GEN() macro generates C++ methods (for the given integer // and algorithm types) that redirect to 
libdivide's C API. -#define DISPATCHER_GEN(T, ALGO) \ - libdivide_##ALGO##_t denom; \ - dispatcher() { } \ - dispatcher(T d) \ - : denom(libdivide_##ALGO##_gen(d)) \ - { } \ - T divide(T n) const { \ - return libdivide_##ALGO##_do(n, &denom); \ - } \ - LIBDIVIDE_DIVIDE_VECTOR(ALGO) \ - T recover() const { \ - return libdivide_##ALGO##_recover(&denom); \ - } +#define DISPATCHER_GEN(T, ALGO) \ + libdivide_##ALGO##_t denom; \ + LIBDIVIDE_INLINE dispatcher() {} \ + LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ + LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ + LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ + LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ + LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX512(ALGO) // The dispatcher selects a specific division algorithm for a given -// type and ALGO using partial template specialization. -template struct dispatcher { }; +// width, signedness, and ALGO using partial template specialization. 
+template +struct dispatcher {}; -template<> struct dispatcher { DISPATCHER_GEN(int32_t, s32) }; -template<> struct dispatcher { DISPATCHER_GEN(int32_t, s32_branchfree) }; -template<> struct dispatcher { DISPATCHER_GEN(uint32_t, u32) }; -template<> struct dispatcher { DISPATCHER_GEN(uint32_t, u32_branchfree) }; -template<> struct dispatcher { DISPATCHER_GEN(int64_t, s64) }; -template<> struct dispatcher { DISPATCHER_GEN(int64_t, s64_branchfree) }; -template<> struct dispatcher { DISPATCHER_GEN(uint64_t, u64) }; -template<> struct dispatcher { DISPATCHER_GEN(uint64_t, u64_branchfree) }; +template <> +struct dispatcher<16, SIGNED, BRANCHFULL> { + DISPATCHER_GEN(int16_t, s16) +}; +template <> +struct dispatcher<16, SIGNED, BRANCHFREE> { + DISPATCHER_GEN(int16_t, s16_branchfree) +}; +template <> +struct dispatcher<16, UNSIGNED, BRANCHFULL> { + DISPATCHER_GEN(uint16_t, u16) +}; +template <> +struct dispatcher<16, UNSIGNED, BRANCHFREE> { + DISPATCHER_GEN(uint16_t, u16_branchfree) +}; +template <> +struct dispatcher<32, SIGNED, BRANCHFULL> { + DISPATCHER_GEN(int32_t, s32) +}; +template <> +struct dispatcher<32, SIGNED, BRANCHFREE> { + DISPATCHER_GEN(int32_t, s32_branchfree) +}; +template <> +struct dispatcher<32, UNSIGNED, BRANCHFULL> { + DISPATCHER_GEN(uint32_t, u32) +}; +template <> +struct dispatcher<32, UNSIGNED, BRANCHFREE> { + DISPATCHER_GEN(uint32_t, u32_branchfree) +}; +template <> +struct dispatcher<64, SIGNED, BRANCHFULL> { + DISPATCHER_GEN(int64_t, s64) +}; +template <> +struct dispatcher<64, SIGNED, BRANCHFREE> { + DISPATCHER_GEN(int64_t, s64_branchfree) +}; +template <> +struct dispatcher<64, UNSIGNED, BRANCHFULL> { + DISPATCHER_GEN(uint64_t, u64) +}; +template <> +struct dispatcher<64, UNSIGNED, BRANCHFREE> { + DISPATCHER_GEN(uint64_t, u64_branchfree) +}; +} // namespace detail + +#if defined(LIBDIVIDE_NEON) +// Allow NeonVecFor outside of detail namespace. 
+template +struct NeonVecFor { + typedef typename detail::NeonVecFor::type type; +}; +#endif // This is the main divider class for use by the user (C++ API). // The actual division algorithm is selected using the dispatcher struct -// based on the integer and algorithm template parameters. -template +// based on the integer width and algorithm template parameters. +template class divider { -public: + private: + // Dispatch based on the size and signedness. + // We avoid using type_traits as it's not available in AVR. + // Detect signedness by checking if T(-1) is less than T(0). + // Also throw in a shift by 0, which prevents floating point types from being passed. + typedef detail::dispatcher> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO> + dispatcher_t; + + public: // We leave the default constructor empty so that creating // an array of dividers and then initializing them // later doesn't slow us down. - divider() { } + divider() {} // Constructor that takes the divisor as a parameter - divider(T d) : div(d) { } + LIBDIVIDE_INLINE divider(T d) : div(d) {} // Divides n by the divisor - T divide(T n) const { - return div.divide(n); - } + LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); } // Recovers the divisor, returns the value that was // used to initialize this divider object. - T recover() const { - return div.recover(); + T recover() const { return div.recover(); } + + bool operator==(const divider &other) const { + return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more; } - bool operator==(const divider& other) const { - return div.denom.magic == other.denom.magic && - div.denom.more == other.denom.more; - } + bool operator!=(const divider &other) const { return !(*this == other); } - bool operator!=(const divider& other) const { - return !(*this == other); - } - -#if defined(LIBDIVIDE_VECTOR_TYPE) - // Treats the vector as packed integer values with the same type as - // the divider (e.g. 
s32, u32, s64, u64) and divides each of - // them by the divider, returning the packed quotients. - LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { + // Vector variants treat the input as packed integer values with the same type as the divider + // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed + // quotients. +#if defined(LIBDIVIDE_SSE2) + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX2) + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX512) + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_NEON) + LIBDIVIDE_INLINE typename NeonVecFor::type divide(typename NeonVecFor::type n) const { return div.divide(n); } #endif -private: + private: // Storage for the actual divisor - dispatcher::value, - std::is_signed::value, sizeof(T), ALGO> div; + dispatcher_t div; }; // Overload of operator / for scalar division -template -T operator/(T n, const divider& div) { +template +LIBDIVIDE_INLINE T operator/(T n, const divider &div) { return div.divide(n); } // Overload of operator /= for scalar division -template -T& operator/=(T& n, const divider& div) { +template +LIBDIVIDE_INLINE T &operator/=(T &n, const divider &div) { n = div.divide(n); return n; } -#if defined(LIBDIVIDE_VECTOR_TYPE) - // Overload of operator / for vector division - template - LIBDIVIDE_VECTOR_TYPE operator/(LIBDIVIDE_VECTOR_TYPE n, const divider& div) { - return div.divide(n); - } - // Overload of operator /= for vector division - template - LIBDIVIDE_VECTOR_TYPE& operator/=(LIBDIVIDE_VECTOR_TYPE& n, const divider& div) { - n = div.divide(n); - return n; - } +// Overloads for vector types. 
+#if defined(LIBDIVIDE_SSE2) +template +LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider &div) { + return div.divide(n); +} + +template +LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX2) +template +LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider &div) { + return div.divide(n); +} + +template +LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX512) +template +LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider &div) { + return div.divide(n); +} + +template +LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider &div) { + n = div.divide(n); + return n; +} #endif -// libdivdie::branchfree_divider +#if defined(LIBDIVIDE_NEON) +template +LIBDIVIDE_INLINE typename NeonVecFor::type operator/( + typename NeonVecFor::type n, const divider &div) { + return div.divide(n); +} + +template +LIBDIVIDE_INLINE typename NeonVecFor::type operator/=( + typename NeonVecFor::type &n, const divider &div) { + n = div.divide(n); + return n; +} +#endif + +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) +// libdivide::branchfree_divider template using branchfree_divider = divider; +#endif -} // namespace libdivide +} // namespace libdivide -#endif // __cplusplus +#endif // __cplusplus -#endif // LIBDIVIDE_H +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#endif // LIBDIVIDE_H diff --git a/src/r_draw.c b/src/r_draw.cpp similarity index 92% rename from src/r_draw.c rename to src/r_draw.cpp index d07323752..24b3c539f 100644 --- a/src/r_draw.c +++ b/src/r_draw.cpp @@ -8,7 +8,7 @@ // terms of the GNU General Public License, version 2. // See the 'LICENSE' file for more details. 
//----------------------------------------------------------------------------- -/// \file r_draw.c +/// \file r_draw.cpp /// \brief span / column drawer functions, for 8bpp and 16bpp /// All drawing to the view buffer is accomplished in this file. /// The other refresh files only know about ccordinates, @@ -33,24 +33,24 @@ #include "hardware/hw_main.h" #endif +#include // -------------------------------------------- // assembly or c drawer routines for 8bpp/16bpp // -------------------------------------------- coldrawfunc_t *colfunc; + coldrawfunc_t *colfuncs[COLDRAWFUNC_MAX]; -#ifdef USE_COL_SPAN_ASM -coldrawfunc_t *colfuncs_asm[COLDRAWFUNC_MAX]; -#endif +coldrawfunc_t *colfuncs_bm[COLDRAWFUNC_MAX]; + int colfunctype; spandrawfunc_t *spanfunc; spandrawfunc_t *spanfuncs[SPANDRAWFUNC_MAX]; +spandrawfunc_t *spanfuncs_bm[SPANDRAWFUNC_MAX]; spandrawfunc_t *spanfuncs_npo2[SPANDRAWFUNC_MAX]; -#ifdef USE_COL_SPAN_ASM -spandrawfunc_t *spanfuncs_asm[SPANDRAWFUNC_MAX]; -#endif +spandrawfunc_t *spanfuncs_bm_npo2[SPANDRAWFUNC_MAX]; spandrawfunc_t *spanfuncs_flat[SPANDRAWFUNC_MAX]; drawcolumndata_t g_dc; @@ -212,17 +212,17 @@ static void R_AllocateBlendTables(void) { if (i == blendtab_modulate) continue; - blendtables[i] = Z_MallocAlign((NUMTRANSTABLES + 1) * 0x10000, PU_STATIC, NULL, 16); + blendtables[i] = static_cast(Z_MallocAlign((NUMTRANSTABLES + 1) * 0x10000, PU_STATIC, NULL, 16)); } // Modulation blending only requires a single table - blendtables[blendtab_modulate] = Z_MallocAlign(0x10000, PU_STATIC, NULL, 16); + blendtables[blendtab_modulate] = static_cast(Z_MallocAlign(0x10000, PU_STATIC, NULL, 16)); } #ifdef HAVE_THREADS static void R_GenerateBlendTables_Thread(void *userdata) { - struct GenerateBlendTables_State *state = userdata; + struct GenerateBlendTables_State *state = static_cast(userdata); R_GenerateBlendTables_Core(state); @@ -239,8 +239,7 @@ void R_InitTranslucencyTables(void) // Load here the transparency lookup tables 'TINTTAB' // NOTE: the TINTTAB 
resource MUST BE aligned on 64k for the asm // optimised code (in other words, transtables pointer low word is 0) - transtables = Z_MallocAlign(NUMTRANSTABLES*0x10000, PU_STATIC, - NULL, 16); + transtables = static_cast(Z_MallocAlign(NUMTRANSTABLES*0x10000, PU_STATIC, NULL, 16)); W_ReadLump(W_GetNumForName("TRANS10"), transtables); W_ReadLump(W_GetNumForName("TRANS20"), transtables+0x10000); @@ -260,11 +259,11 @@ void R_GenerateBlendTables(void) { #ifdef HAVE_THREADS // Allocate copies for the worker thread since the originals can be freed in the main thread. - struct GenerateBlendTables_State *state = malloc(sizeof *state); + struct GenerateBlendTables_State *state = static_cast(malloc(sizeof *state)); size_t palsize = 256 * sizeof(RGBA_t); - state->masterPalette = memcpy(malloc(palsize), pMasterPalette, palsize); - state->gammaCorrectedPalette = memcpy(malloc(palsize), pGammaCorrectedPalette, palsize); + state->masterPalette = static_cast(memcpy(malloc(palsize), pMasterPalette, palsize)); + state->gammaCorrectedPalette = static_cast(memcpy(malloc(palsize), pGammaCorrectedPalette, palsize)); I_spawn_thread("blend-tables", R_GenerateBlendTables_Thread, state); @@ -313,7 +312,7 @@ void R_GenerateTranslucencyTable(UINT8 *table, RGBA_t* sourcepal, int style, UIN } } -#define ClipTransLevel(trans) max(min((trans), NUMTRANSMAPS-2), 0) +#define ClipTransLevel(trans) std::clamp(trans, 0, NUMTRANSMAPS-2) UINT8 *R_GetTranslucencyTable(INT32 alphalevel) { @@ -364,7 +363,7 @@ UINT8* R_GetTranslationColormap(INT32 skinnum, skincolornum_t color, UINT8 flags { // Allocate table for skin if necessary if (!translationtablecache[skintableindex]) - translationtablecache[skintableindex] = Z_Calloc(MAXSKINCOLORS * sizeof(UINT8**), PU_STATIC, NULL); + translationtablecache[skintableindex] = static_cast(Z_Calloc(MAXSKINCOLORS * sizeof(UINT8**), PU_STATIC, NULL)); // Get colormap ret = translationtablecache[skintableindex][color]; @@ -383,7 +382,7 @@ UINT8* R_GetTranslationColormap(INT32 
skinnum, skincolornum_t color, UINT8 flags // Generate the colormap if necessary if (!ret) { - ret = Z_MallocAlign(NUM_PALETTE_ENTRIES, (flags & GTC_CACHE) ? PU_LEVEL : PU_STATIC, NULL, 8); + ret = static_cast(Z_MallocAlign(NUM_PALETTE_ENTRIES, (flags & GTC_CACHE) ? PU_LEVEL : PU_STATIC, NULL, 8)); K_GenerateKartColormap(ret, skinnum, color); //R_GenerateTranslationColormap(ret, skinnum, color); // SRB2kart // Cache the colormap if desired @@ -425,7 +424,7 @@ UINT16 R_GetColorByName(const char *name) UINT16 R_GetSuperColorByName(const char *name) { UINT16 i, color = SKINCOLOR_NONE; - char *realname = Z_Malloc(MAXCOLORNAME+1, PU_STATIC, NULL); + char *realname = static_cast(Z_Malloc(MAXCOLORNAME+1, PU_STATIC, NULL)); snprintf(realname, MAXCOLORNAME+1, "Super %s 1", name); for (i = 1; i < numskincolors; i++) if (!stricmp(skincolors[i].name, realname)) { @@ -655,17 +654,8 @@ void R_DrawViewBorder(void) #endif // ========================================================================== -// INCLUDE 8bpp DRAWING CODE HERE +// INCLUDE MAIN DRAWERS CODE HERE // ========================================================================== -#include "r_draw8.c" -#include "r_draw8_npo2.c" -#include "r_draw8_flat.c" - -// ========================================================================== -// INCLUDE 16bpp DRAWING CODE HERE -// ========================================================================== - -#ifdef HIGHCOLOR -#include "r_draw16.c" -#endif +#include "r_draw_column.cpp" +#include "r_draw_span.cpp" diff --git a/src/r_draw.h b/src/r_draw.h index 3cc1381e8..643cc6403 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -64,22 +64,20 @@ extern float zeroheight; extern lumpnum_t viewborderlump[8]; - - // --------------------------------------------- // color mode dependent drawer function pointers // --------------------------------------------- -#define USE_COL_SPAN_ASM 0 - #define BASEDRAWFUNC 0 +typedef void (coldrawfunc_t)(drawcolumndata_t*); +typedef void 
(spandrawfunc_t)(drawspandata_t*); + enum { COLDRAWFUNC_BASE = BASEDRAWFUNC, COLDRAWFUNC_FUZZY, COLDRAWFUNC_TRANS, - COLDRAWFUNC_SHADE, COLDRAWFUNC_SHADOWED, COLDRAWFUNC_TRANSTRANS, COLDRAWFUNC_TWOSMULTIPATCH, @@ -90,15 +88,11 @@ enum COLDRAWFUNC_MAX }; -typedef void (coldrawfunc_t)(drawcolumndata_t*); -typedef void (spandrawfunc_t)(drawspandata_t*); - -extern coldrawfunc_t *colfunc; -extern coldrawfunc_t *colfuncs[COLDRAWFUNC_MAX]; -#ifdef USE_COL_SPAN_ASM -extern coldrawfunc_t *colfuncs_asm[COLDRAWFUNC_MAX]; -#endif extern int colfunctype; +extern coldrawfunc_t *colfunc; + +extern coldrawfunc_t *colfuncs[COLDRAWFUNC_MAX]; +extern coldrawfunc_t *colfuncs_bm[COLDRAWFUNC_MAX]; enum { @@ -120,16 +114,17 @@ enum SPANDRAWFUNC_TILTEDWATER, SPANDRAWFUNC_FOG, + SPANDRAWFUNC_TILTEDFOG, SPANDRAWFUNC_MAX }; extern spandrawfunc_t *spanfunc; + extern spandrawfunc_t *spanfuncs[SPANDRAWFUNC_MAX]; +extern spandrawfunc_t *spanfuncs_bm[SPANDRAWFUNC_MAX]; extern spandrawfunc_t *spanfuncs_npo2[SPANDRAWFUNC_MAX]; -#ifdef USE_COL_SPAN_ASM -extern spandrawfunc_t *spanfuncs_asm[SPANDRAWFUNC_MAX]; -#endif +extern spandrawfunc_t *spanfuncs_bm_npo2[SPANDRAWFUNC_MAX]; extern spandrawfunc_t *spanfuncs_flat[SPANDRAWFUNC_MAX]; // ------------------------------------------------ @@ -202,90 +197,98 @@ void R_DrawViewBorder(void); // 8bpp DRAWING CODE // ----------------- -void R_DrawColumn_8(drawcolumndata_t* dc); -void R_DrawShadeColumn_8(drawcolumndata_t* dc); -void R_DrawTranslucentColumn_8(drawcolumndata_t* dc); -void R_DrawDropShadowColumn_8(drawcolumndata_t* dc); -void R_DrawTranslatedColumn_8(drawcolumndata_t* dc); -void R_DrawTranslatedTranslucentColumn_8(drawcolumndata_t* dc); -void R_Draw2sMultiPatchColumn_8(drawcolumndata_t* dc); -void R_Draw2sMultiPatchTranslucentColumn_8(drawcolumndata_t* dc); -void R_DrawFogColumn_8(drawcolumndata_t* dc); -void R_DrawColumnShadowed_8(drawcolumndata_t* dc); +void R_DrawColumn(drawcolumndata_t* dc); +void R_DrawTranslucentColumn(drawcolumndata_t* dc); 
+void R_DrawDropShadowColumn(drawcolumndata_t* dc); +void R_DrawTranslatedColumn(drawcolumndata_t* dc); +void R_DrawTranslatedTranslucentColumn(drawcolumndata_t* dc); +void R_Draw2sMultiPatchColumn(drawcolumndata_t* dc); +void R_Draw2sMultiPatchTranslucentColumn(drawcolumndata_t* dc); +void R_DrawFogColumn(drawcolumndata_t* dc); +void R_DrawColumnShadowed(drawcolumndata_t* dc); -#define PLANELIGHTFLOAT (BASEVIDWIDTH * BASEVIDWIDTH / vid.width / ds->zeroheight / 21.0f * FIXED_TO_FLOAT(fovtan[viewssnum])) +void R_DrawColumn_Brightmap(drawcolumndata_t* dc); +void R_DrawTranslucentColumn_Brightmap(drawcolumndata_t* dc); +void R_DrawTranslatedColumn_Brightmap(drawcolumndata_t* dc); +void R_DrawTranslatedTranslucentColumn_Brightmap(drawcolumndata_t* dc); +void R_Draw2sMultiPatchColumn_Brightmap(drawcolumndata_t* dc); +void R_Draw2sMultiPatchTranslucentColumn_Brightmap(drawcolumndata_t* dc); +void R_DrawColumnShadowed_Brightmap(drawcolumndata_t* dc); -void R_DrawSpan_8(drawspandata_t* ds); -void R_DrawTranslucentSpan_8(drawspandata_t* ds); -void R_DrawTiltedSpan_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentSpan_8(drawspandata_t* ds); +void R_DrawSpan(drawspandata_t* ds); +void R_DrawTranslucentSpan(drawspandata_t* ds); +void R_DrawSplat(drawspandata_t* ds); +void R_DrawTranslucentSplat(drawspandata_t* ds); +void R_DrawFloorSprite(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan(drawspandata_t* ds); +void R_DrawFogSpan(drawspandata_t* ds); -void R_DrawSplat_8(drawspandata_t* ds); -void R_DrawTranslucentSplat_8(drawspandata_t* ds); -void R_DrawTiltedSplat_8(drawspandata_t* ds); +void R_DrawSpan_Tilted(drawspandata_t* ds); +void R_DrawTranslucentSpan_Tilted(drawspandata_t* ds); +void R_DrawSplat_Tilted(drawspandata_t* ds); +void R_DrawTranslucentSplat_Tilted(drawspandata_t* ds); +void R_DrawFloorSprite_Tilted(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Tilted(drawspandata_t* ds); +void 
R_DrawTranslucentWaterSpan_Tilted(drawspandata_t* ds); +void R_DrawFogSpan_Tilted(drawspandata_t* ds); -void R_DrawFloorSprite_8(drawspandata_t* ds); -void R_DrawTranslucentFloorSprite_8(drawspandata_t* ds); -void R_DrawTiltedFloorSprite_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentFloorSprite_8(drawspandata_t* ds); +void R_DrawSpan_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSpan_NPO2(drawspandata_t* ds); +void R_DrawSplat_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSplat_NPO2(drawspandata_t* ds); +void R_DrawFloorSprite_NPO2(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_NPO2(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_NPO2(drawspandata_t* ds); -void R_CalcTiltedLighting(INT32 *lightbuffer, INT32 x1, INT32 x2, fixed_t start, fixed_t end); +void R_DrawSpan_Tilted_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSpan_Tilted_NPO2(drawspandata_t* ds); +void R_DrawSplat_Tilted_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSplat_Tilted_NPO2(drawspandata_t* ds); +void R_DrawFloorSprite_Tilted_NPO2(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Tilted_NPO2(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Tilted_NPO2(drawspandata_t* ds); -void R_DrawTranslucentWaterSpan_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentWaterSpan_8(drawspandata_t* ds); +void R_DrawSpan_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentSpan_Brightmap(drawspandata_t* ds); +void R_DrawSplat_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentSplat_Brightmap(drawspandata_t* ds); +void R_DrawFloorSprite_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Brightmap(drawspandata_t* ds); -void R_DrawFogSpan_8(drawspandata_t* ds); +void R_DrawSpan_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentSpan_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawSplat_Tilted_Brightmap(drawspandata_t* ds); +void 
R_DrawTranslucentSplat_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawFloorSprite_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Tilted_Brightmap(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Tilted_Brightmap(drawspandata_t* ds); -// Lactozilla: Non-powers-of-two -void R_DrawSpan_NPO2_8(drawspandata_t* ds); -void R_DrawTranslucentSpan_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedSpan_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentSpan_NPO2_8(drawspandata_t* ds); +void R_DrawSpan_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSpan_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawSplat_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSplat_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawFloorSprite_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Brightmap_NPO2(drawspandata_t* ds); -void R_DrawSplat_NPO2_8(drawspandata_t* ds); -void R_DrawTranslucentSplat_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedSplat_NPO2_8(drawspandata_t* ds); - -void R_DrawFloorSprite_NPO2_8(drawspandata_t* ds); -void R_DrawTranslucentFloorSprite_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedFloorSprite_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentFloorSprite_NPO2_8(drawspandata_t* ds); - -void R_DrawTranslucentWaterSpan_NPO2_8(drawspandata_t* ds); -void R_DrawTiltedTranslucentWaterSpan_NPO2_8(drawspandata_t* ds); +void R_DrawSpan_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSpan_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawSplat_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentSplat_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawFloorSprite_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentFloorSprite_Tilted_Brightmap_NPO2(drawspandata_t* ds); +void R_DrawTranslucentWaterSpan_Tilted_Brightmap_NPO2(drawspandata_t* ds); // Debugging - highlight surfaces 
in flat colors -void R_DrawColumn_Flat_8(drawcolumndata_t* dc); -void R_DrawSpan_Flat_8(drawspandata_t* ds); -void R_DrawTiltedSpan_Flat_8(drawspandata_t* ds); - -#ifdef USEASM -void ASMCALL R_DrawColumn_8_ASM(void); -void ASMCALL R_DrawShadeColumn_8_ASM(void); -void ASMCALL R_DrawTranslucentColumn_8_ASM(void); -void ASMCALL R_Draw2sMultiPatchColumn_8_ASM(void); - -void ASMCALL R_DrawColumn_8_MMX(void); - -void ASMCALL R_Draw2sMultiPatchColumn_8_MMX(void); -void ASMCALL R_DrawSpan_8_MMX(void); -#endif - -// ------------------ -// 16bpp DRAWING CODE -// ------------------ - -#ifdef HIGHCOLOR -void R_DrawColumn_16(void); -void R_DrawWallColumn_16(void); -void R_DrawTranslucentColumn_16(void); -void R_DrawTranslatedColumn_16(void); -void R_DrawSpan_16(void); -#endif +void R_DrawColumn_Flat(drawcolumndata_t* dc); +void R_DrawSpan_Flat(drawspandata_t* ds); +void R_DrawTiltedSpan_Flat(drawspandata_t* ds); #ifdef __cplusplus } // extern "C" -#endif +#endif // __cplusplus // ========================================================================= #endif // __R_DRAW__ diff --git a/src/r_draw16.c b/src/r_draw16.c deleted file mode 100644 index 8b1d29e8d..000000000 --- a/src/r_draw16.c +++ /dev/null @@ -1,214 +0,0 @@ -// SONIC ROBO BLAST 2 -//----------------------------------------------------------------------------- -// Copyright (C) 1998-2000 by DooM Legacy Team. -// Copyright (C) 1999-2020 by Sonic Team Junior. -// -// This program is free software distributed under the -// terms of the GNU General Public License, version 2. -// See the 'LICENSE' file for more details. 
-//----------------------------------------------------------------------------- -/// \file r_draw16.c -/// \brief 16bpp (HIGHCOLOR) span/column drawer functions -/// \note no includes because this is included as part of r_draw.c - -// ========================================================================== -// COLUMNS -// ========================================================================== - -/// \brief kick out the upper bit of each component (we're in 5 : 5 : 5) -#define HIMASK1 0x7bde - -/** \brief The R_DrawColumn_16 function - standard upto 128high posts column drawer -*/ -void R_DrawColumn_16(void) -{ - INT32 count; - INT16 *dest; - fixed_t frac, fracstep; - - count = dc_yh - dc_yl + 1; - - // Zero length, column does not exceed a pixel. - if (count <= 0) - return; - -#ifdef RANGECHECK - if (dc_x >= vid.width || dc_yl < 0 || dc_yh >= vid.height) - I_Error("R_DrawColumn_16: %d to %d at %d", dc_yl, dc_yh, dc_x); -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - dest = (INT16 *)(void *)(ylookup[dc_yl] + columnofs[dc_x]); - - // Determine scaling, which is the only mapping to be done. - fracstep = dc_iscale; - frac = dc_texturemid + (dc_yl - centery)*fracstep; - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. - - do - { - // Re-map color indices from wall texture column using a lighting/special effects LUT. - *dest = hicolormaps[((INT16 *)(void *)dc_source)[(frac>>FRACBITS)&127]>>1]; - - dest += vid.width; - frac += fracstep; - } while (--count); -} - -/** \brief The R_DrawWallColumn_16 function - LAME cutnpaste: same as R_DrawColumn_16 but wraps around 256 - instead of 128 for the tall sky textures (256x240) -*/ -void R_DrawWallColumn_16(void) -{ - INT32 count; - INT16 *dest; - fixed_t frac, fracstep; - - count = dc_yh - dc_yl + 1; - - // Zero length, column does not exceed a pixel. 
- if (count <= 0) - return; - -#ifdef RANGECHECK - if (dc_x >= vid.width || dc_yl < 0 || dc_yh >= vid.height) - I_Error("R_DrawWallColumn_16: %d to %d at %d", dc_yl, dc_yh, dc_x); -#endif - - dest = (INT16 *)(void *)(ylookup[dc_yl] + columnofs[dc_x]); - - fracstep = dc_iscale; - frac = dc_texturemid + (dc_yl - centery)*fracstep; - - do - { - *dest = hicolormaps[((INT16 *)(void *)dc_source)[(frac>>FRACBITS)&255]>>1]; - - dest += vid.width; - frac += fracstep; - } while (--count); -} - -/** \brief The R_DrawTranslucentColumn_16 function - LAME cutnpaste: same as R_DrawColumn_16 but does - translucent -*/ -void R_DrawTranslucentColumn_16(void) -{ - INT32 count; - INT16 *dest; - fixed_t frac, fracstep; - - // check out coords for src* - if ((dc_yl < 0) || (dc_x >= vid.width)) - return; - - count = dc_yh - dc_yl; - if (count < 0) - return; - -#ifdef RANGECHECK - if (dc_x >= vid.width || dc_yl < 0 || dc_yh >= vid.height) - I_Error("R_DrawTranslucentColumn_16: %d to %d at %d", dc_yl, dc_yh, dc_x); -#endif - - // FIXME. As above. - dest = (INT16 *)(void *)(ylookup[dc_yl] + columnofs[dc_x]); - - // Looks familiar. - fracstep = dc_iscale; - frac = dc_texturemid + (dc_yl - centery)*fracstep; - - // Here we do an additional index re-mapping. - do - { - *dest = (INT16)((INT16)((color8to16[dc_source[frac>>FRACBITS]]>>1) & 0x39ce) - + (INT16)(((*dest & HIMASK1)) & 0x7fff)); - - dest += vid.width; - frac += fracstep; - } while (count--); -} - -/** \brief The R_DrawTranslatedColumn_16 function - ? -*/ -void R_DrawTranslatedColumn_16(void) -{ - INT32 count; - INT16 *dest; - fixed_t frac, fracstep; - - count = dc_yh - dc_yl; - if (count < 0) - return; - -#ifdef RANGECHECK - if (dc_x >= vid.width || dc_yl < 0 || dc_yh >= vid.height) - I_Error("R_DrawTranslatedColumn_16: %d to %d at %d", dc_yl, dc_yh, dc_x); -#endif - - dest = (INT16 *)(void *)(ylookup[dc_yl] + columnofs[dc_x]); - - // Looks familiar. 
- fracstep = dc_iscale; - frac = dc_texturemid + (dc_yl - centery)*fracstep; - - // Here we do an additional index re-mapping. - do - { - *dest = color8to16[dc_colormap[dc_translation[dc_source[frac>>FRACBITS]]]]; - dest += vid.width; - - frac += fracstep; - } while (count--); -} - -// ========================================================================== -// SPANS -// ========================================================================== - -/** \brief The R_*_16 function - Draws the actual span. -*/ -void R_DrawSpan_16(void) -{ - fixed_t xfrac, yfrac; - INT16 *dest; - INT32 count, spot; - -#ifdef RANGECHECK - if (ds_x2 < ds_x1 || ds_x1 < 0 || ds_x2 >= vid.width || ds_y > vid.height) - I_Error("R_DrawSpan_16: %d to %d at %d", ds_x1, ds_x2, ds_y); -#endif - - xfrac = ds_xfrac; - yfrac = ds_yfrac; - - dest = (INT16 *)(void *)(ylookup[ds_y] + columnofs[ds_x1]); - - // We do not check for zero spans here? - count = ds_x2 - ds_x1; - - if (count <= 0) // We do now! - return; - - do - { - // Current texture index in u, v. - spot = ((yfrac>>(16-6))&(63*64)) + ((xfrac>>16)&63); - - // Lookup pixel from flat texture tile, re-index using light/colormap. - *dest++ = hicolormaps[((INT16 *)(void *)ds_source)[spot]>>1]; - - // Next step in u, v. - xfrac += ds_xstep; - yfrac += ds_ystep; - } while (count--); -} diff --git a/src/r_draw8.c b/src/r_draw8.c deleted file mode 100644 index 8840106e9..000000000 --- a/src/r_draw8.c +++ /dev/null @@ -1,2564 +0,0 @@ -// SONIC ROBO BLAST 2 -//----------------------------------------------------------------------------- -// Copyright (C) 1998-2000 by DooM Legacy Team. -// Copyright (C) 1999-2021 by Sonic Team Junior. -// -// This program is free software distributed under the -// terms of the GNU General Public License, version 2. -// See the 'LICENSE' file for more details. 
-//----------------------------------------------------------------------------- -/// \file r_draw8.c -/// \brief 8bpp span/column drawer functions -/// \note no includes because this is included as part of r_draw.c - -#include - -// ========================================================================== -// COLUMNS -// ========================================================================== - -// A column is a vertical slice/span of a wall texture that uses -// a has a constant z depth from top to bottom. -// - -/** \brief The R_DrawColumn_8 function - Experiment to make software go faster. Taken from the Boom source -*/ -void R_DrawColumn_8(drawcolumndata_t* dc) -{ - INT32 count; - register UINT8 *dest; - register fixed_t frac; - fixed_t fracstep; - - count = dc->yh - dc->yl; - - if (count < 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - return; -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - count++; - - // Determine scaling, which is the only mapping to be done. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. 
- { - register const UINT8 *source = dc->source; - register const UINT8 *brightmap = dc->brightmap; - register const lighttable_t *colormap = dc->colormap; - register const lighttable_t *fullbright = dc->fullbright; - register INT32 heightmask = dc->texheight-1; - if (dc->texheight & heightmask) // not a power of 2 -- killough - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0); - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - if (brightmap != NULL && brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = fullbright[source[frac>>FRACBITS]]; - } - else - { - *dest = colormap[source[frac>>FRACBITS]]; - } - dest += vid.width; - - // Avoid overflow. - if (fracstep > 0x7FFFFFFF - frac) - frac += fracstep - heightmask; - else - frac += fracstep; - - while (frac >= heightmask) - frac -= heightmask; - } while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[source[(frac>>FRACBITS) & heightmask]]; - } - else - { - *dest = colormap[source[(frac>>FRACBITS) & heightmask]]; - } - - dest += vid.width; - frac += fracstep; - - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[source[(frac>>FRACBITS) & heightmask]]; - } - else - { - *dest = colormap[source[(frac>>FRACBITS) & heightmask]]; - } - - dest += vid.width; - frac += fracstep; - } - - if (count & 1) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[source[(frac>>FRACBITS) & heightmask]]; - } - else - { - *dest = colormap[source[(frac>>FRACBITS) & heightmask]]; - } - } - } - } -} - -void R_Draw2sMultiPatchColumn_8(drawcolumndata_t* dc) -{ - INT32 count; - 
register UINT8 *dest; - register fixed_t frac; - fixed_t fracstep; - - count = dc->yh - dc->yl; - - if (count < 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - return; -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - count++; - - // Determine scaling, which is the only mapping to be done. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. - { - register const UINT8 *source = dc->source; - register const UINT8 *brightmap = dc->brightmap; - register const lighttable_t *colormap = dc->colormap; - register const lighttable_t *fullbright = dc->fullbright; - register INT32 heightmask = dc->texheight-1; - register UINT8 val; - if (dc->texheight & heightmask) // not a power of 2 -- killough - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0); - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - val = source[frac>>FRACBITS]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - - dest += vid.width; - - // Avoid overflow. 
- if (fracstep > 0x7FFFFFFF - frac) - frac += fracstep - heightmask; - else - frac += fracstep; - - while (frac >= heightmask) - frac -= heightmask; - } while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - - dest += vid.width; - frac += fracstep; - - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - - dest += vid.width; - frac += fracstep; - } - - if (count & 1) - { - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - } - } - } -} - -void R_Draw2sMultiPatchTranslucentColumn_8(drawcolumndata_t* dc) -{ - INT32 count; - register UINT8 *dest; - register fixed_t frac; - fixed_t fracstep; - - count = dc->yh - dc->yl; - - if (count < 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - return; -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - count++; - - // Determine scaling, which is the only mapping to be done. 
- fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. - { - register const UINT8 *source = dc->source; - register const UINT8 *brightmap = dc->brightmap; - register const UINT8 *transmap = dc->transmap; - register const lighttable_t *colormap = dc->colormap; - register const lighttable_t *fullbright = dc->fullbright; - register INT32 heightmask = dc->texheight-1; - register UINT8 val; - if (dc->texheight & heightmask) // not a power of 2 -- killough - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0); - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - val = source[frac>>FRACBITS]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[val]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[val]<<8) + (*dest)); - } - } - - dest += vid.width; - - // Avoid overflow. 
- if (fracstep > 0x7FFFFFFF - frac) - frac += fracstep - heightmask; - else - frac += fracstep; - - while (frac >= heightmask) - frac -= heightmask; - } while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[val]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[val]<<8) + (*dest)); - } - } - - dest += vid.width; - frac += fracstep; - - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[val]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[val]<<8) + (*dest)); - } - } - - dest += vid.width; - frac += fracstep; - } - if (count & 1) - { - val = source[(frac>>FRACBITS) & heightmask]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS) & heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[val]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[val]<<8) + (*dest)); - } - } - } - } - } -} - -/** \brief The R_DrawShadeColumn_8 function - Experiment to make software go faster. Taken from the Boom source -*/ -void R_DrawShadeColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - register fixed_t frac, fracstep; - - // check out coords for src* - if ((dc->yl < 0) || (dc->x >= vid.width)) - return; - - count = dc->yh - dc->yl; - if (count < 0) - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawShadeColumn_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // FIXME. As above. 
- //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - // Looks familiar. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Here we do an additional index re-mapping. - do - { - *dest = colormaps[(dc->source[frac>>FRACBITS] <<8) + (*dest)]; - dest += vid.width; - frac += fracstep; - } while (count--); -} - -/** \brief The R_DrawTranslucentColumn_8 function - I've made an asm routine for the transparency, because it slows down - a lot in 640x480 with big sprites (bfg on all screen, or transparent - walls on fullscreen) -*/ -void R_DrawTranslucentColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - register fixed_t frac, fracstep; - - count = dc->yh - dc->yl + 1; - - if (count <= 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawTranslucentColumn_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // FIXME. As above. - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - // Looks familiar. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. 
- { - register const UINT8 *source = dc->source; - register const UINT8 *brightmap = dc->brightmap; - register const UINT8 *transmap = dc->transmap; - register const lighttable_t *colormap = dc->colormap; - register const lighttable_t *fullbright = dc->fullbright; - register INT32 heightmask = dc->texheight - 1; - if (dc->texheight & heightmask) - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0) - ; - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - if (brightmap != NULL && brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[frac>>FRACBITS]]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[source[frac>>FRACBITS]]<<8) + (*dest)); - } - dest += vid.width; - if ((frac += fracstep) >= heightmask) - frac -= heightmask; - } - while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - dest += vid.width; - frac += fracstep; - - if (brightmap != NULL && brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - else - { - *dest = *(transmap + (colormap[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - dest += vid.width; - frac += fracstep; - } - if (count & 1) - { - if (brightmap != NULL && brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - else - { - *dest = *(transmap + 
(colormap[source[(frac>>FRACBITS)&heightmask]]<<8) + (*dest)); - } - } - } - } -} - -// Hack: A cut-down copy of R_DrawTranslucentColumn_8 that does not read texture -// data since something about calculating the texture reading address for drop shadows is broken. -// dc_texturemid and dc_iscale get wrong values for drop shadows, however those are not strictly -// needed for the current design of the shadows, so this function bypasses the issue -// by not using those variables at all. -void R_DrawDropShadowColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - - count = dc->yh - dc->yl + 1; - - if (count <= 0) // Zero length, column does not exceed a pixel. - return; - - dest = &topleft[dc->yl*vid.width + dc->x]; - - { - register const UINT8 *transmap_offset = dc->transmap + (dc->shadowcolor << 8); - while ((count -= 2) >= 0) - { - *dest = *(transmap_offset + (*dest)); - dest += vid.width; - *dest = *(transmap_offset + (*dest)); - dest += vid.width; - } - if (count & 1) - *dest = *(transmap_offset + (*dest)); - } -} - -/** \brief The R_DrawTranslatedTranslucentColumn_8 function - Spiffy function. Not only does it colormap a sprite, but does translucency as well. - Uber-kudos to Cyan Helkaraxe -*/ -void R_DrawTranslatedTranslucentColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - register fixed_t frac, fracstep; - - count = dc->yh - dc->yl + 1; - - if (count <= 0) // Zero length, column does not exceed a pixel. - return; - - // FIXME. As above. - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl * vid.width + dc->x]; - - // Looks familiar. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl - centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. - // This is as fast as it gets. 
- { - register INT32 heightmask = dc->texheight - 1; - if (dc->texheight & heightmask) - { - heightmask++; - heightmask <<= FRACBITS; - - if (frac < 0) - while ((frac += heightmask) < 0) - ; - else - while (frac >= heightmask) - frac -= heightmask; - - do - { - // Re-map color indices from wall texture column - // using a lighting/special effects LUT. - // heightmask is the Tutti-Frutti fix - - if (dc->brightmap != NULL && dc->brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = *(dc->transmap + (dc->fullbright[dc->translation[dc->source[frac>>FRACBITS]]]<<8) + (*dest)); - } - else - { - *dest = *(dc->transmap + (dc->colormap[dc->translation[dc->source[frac>>FRACBITS]]]<<8) + (*dest)); - } - - dest += vid.width; - if ((frac += fracstep) >= heightmask) - frac -= heightmask; - } - while (--count); - } - else - { - while ((count -= 2) >= 0) // texture height is a power of 2 - { - if (dc->brightmap != NULL && dc->brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(dc->transmap + (dc->fullbright[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - else - { - *dest = *(dc->transmap + (dc->colormap[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - - dest += vid.width; - frac += fracstep; - - if (dc->brightmap != NULL && dc->brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(dc->transmap + (dc->fullbright[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - else - { - *dest = *(dc->transmap + (dc->colormap[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - - dest += vid.width; - frac += fracstep; - } - if (count & 1) - { - if (dc->brightmap != NULL && dc->brightmap[(frac>>FRACBITS)&heightmask] == BRIGHTPIXEL) - { - *dest = *(dc->transmap + (dc->fullbright[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - else - { - *dest = *(dc->transmap + 
(dc->colormap[dc->translation[dc->source[(frac>>FRACBITS)&heightmask]]]<<8) + (*dest)); - } - } - } - } -} - -/** \brief The R_DrawTranslatedColumn_8 function - Draw columns up to 128 high but remap the green ramp to other colors - - \warning STILL NOT IN ASM, TO DO.. -*/ -void R_DrawTranslatedColumn_8(drawcolumndata_t* dc) -{ - register INT32 count; - register UINT8 *dest; - register fixed_t frac, fracstep; - - count = dc->yh - dc->yl; - if (count < 0) - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawTranslatedColumn_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // FIXME. As above. - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl*vid.width + dc->x]; - - // Looks familiar. - fracstep = dc->iscale; - //frac = dc_texturemid + (dc_yl-centery)*fracstep; - frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep))*(!dc->hires); - - // Here we do an additional index re-mapping. - do - { - // Translation tables are used - // to map certain colorramps to other ones, - // used with PLAY sprites. - // Thus the "green" ramp of the player 0 sprite - // is mapped to gray, red, black/indigo. - if (dc->brightmap != NULL && dc->brightmap[frac>>FRACBITS] == BRIGHTPIXEL) - { - *dest = dc->fullbright[dc->translation[dc->source[frac>>FRACBITS]]]; - } - else - { - *dest = dc->colormap[dc->translation[dc->source[frac>>FRACBITS]]]; - } - - dest += vid.width; - - frac += fracstep; - } while (count--); -} - -// ========================================================================== -// SPANS -// ========================================================================== - -#define SPANSIZE 16 -#define INVSPAN 0.0625f - -// 4194303 = (2048x2048)-1 (2048x2048 is maximum flat size) -#define MAXFLATBYTES 4194303 - -/** \brief The R_DrawSpan_8 function - Draws the actual span. 
-*/ -void R_DrawSpan_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? - xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - if (dest+8 > deststop) - return; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! 
- - for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = fullbright[source[bit]]; - } - else - { - dest[i] = colormap[source[bit]]; - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - *dest = colormap[source[bit]]; - } - - dest++; - xposition += xstep; - yposition += ystep; - } -} - -// R_CalcTiltedLighting -// Exactly what it says on the tin. I wish I wasn't too lazy to explain things properly. -void R_CalcTiltedLighting(INT32 *lightbuffer, INT32 x1, INT32 x2, fixed_t start, fixed_t end) -{ - // ZDoom uses a different lighting setup to us, and I couldn't figure out how to adapt their version - // of this function. Here's my own. - INT32 left = x1, right = x2; - fixed_t step = (end-start)/(x2 - x1 + 1); - INT32 i; - - // I wanna do some optimizing by checking for out-of-range segments on either side to fill in all at once, - // but I'm too bad at coding to not crash the game trying to do that. I guess this is fast enough for now... - - for (i = left; i <= right; i++) { - lightbuffer[i] = (start += step) >> FRACBITS; - if (lightbuffer[i] < 0) - lightbuffer[i] = 0; - else if (lightbuffer[i] >= MAXLIGHTSCALE) - lightbuffer[i] = MAXLIGHTSCALE-1; - } -} - -/** \brief The R_DrawTiltedSpan_8 function - Draw slopes! Holy sheit! 
-*/ -void R_DrawTiltedSpan_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - INT32 tiltlighting[MAXVIDWIDTH]; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - - source = ds->source; - brightmap = ds->brightmap; - //colormap = ds_colormap; - fullbright = ds->fullbright; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. 
- i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - colormap = planezlight[tiltlighting[ds_x1]] + (ds_colormap - colormaps); - *dest = colormap[source[bit]]; - } - dest++; - ds_x1++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[source[bit]]; - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[source[bit]]; - } - ds->x1++; - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - 
stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[source[bit]]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[source[bit]]; - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -/** \brief The R_DrawTiltedTranslucentSpan_8 function - Like DrawTiltedSpan, but translucent -*/ -void R_DrawTiltedTranslucentSpan_8(drawspandata_t* ds) -{ - TracyCZone(__zone, true); - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - INT32 tiltlighting[MAXVIDWIDTH]; - - INT32 x1 = ds->x1; - const INT32 nflatxshift = ds->nflatxshift; - const INT32 nflatyshift = ds->nflatyshift; - const INT32 nflatmask = ds->nflatmask; - UINT8 *transmap = ds->transmap; - lighttable_t **planezlight = ds->planezlight; - lighttable_t *ds_colormap = ds->colormap; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. 
It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - - source = ds->source; - brightmap = ds->brightmap; - //colormap = ds_colormap; - fullbright = ds->fullbright; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. - i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds_transmap + (fullbright[source[bit]] << 8) + *dest); - } - else - { - colormap = planezlight[tiltlighting[ds_x1]] + (ds_colormap - colormaps); - *dest = *(ds_transmap + (colormap[source[bit]] << 8) + *dest); - } - dest++; - ds_x1++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - x1 = ds->x1; - - for (i = 0; i < SPANSIZE; i++) - { - bit = (((v + stepv * i) >> nflatyshift) & nflatmask) | ((u + 
stepu * i) >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(transmap + (fullbright[source[bit]] << 8) + dest[i]); - } - else - { - colormap = planezlight[tiltlighting[x1 + i]] + (ds_colormap - colormaps); - dest[i] = *(transmap + (colormap[source[bit]] << 8) + dest[i]); - } - } - ds->x1 += SPANSIZE; - dest += SPANSIZE; - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[bit]] << 8) + *dest); - } - else - { - colormap = planezlight[tiltlighting[ds->x1]] + (ds_colormap - colormaps); - *dest = *(transmap + (colormap[source[bit]] << 8) + *dest); - } - ds->x1++; - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift);; - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[bit]] << 8) + *dest); - } - else - { - colormap = planezlight[tiltlighting[ds->x1]] + (ds_colormap - colormaps); - *dest = *(transmap + (colormap[source[bit]] << 8) + *dest); - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - } - } -#endif - TracyCZoneEnd(__zone); -} - -/** \brief The R_DrawTiltedTranslucentWaterSpan_8 function - Like DrawTiltedTranslucentSpan, but for water -*/ -void R_DrawTiltedTranslucentWaterSpan_8(drawspandata_t* ds) -{ - TracyCZone(__zone, true); - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 
*source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - UINT8 *dsrc; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - INT32 tiltlighting[MAXVIDWIDTH]; - - INT32 x1 = ds->x1; - const INT32 nflatxshift = ds->nflatxshift; - const INT32 nflatyshift = ds->nflatyshift; - const INT32 nflatmask = ds->nflatmask; - UINT8 *transmap = ds->transmap; - lighttable_t **planezlight = ds->planezlight; - lighttable_t *ds_colormap = ds->colormap; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - dsrc = screens[1] + (ds->y+ds->bgofs)*vid.width + ds->x1; - source = ds->source; - brightmap = ds->brightmap; - //colormap = ds_colormap; - fullbright = ds->fullbright; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. 
- i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds_transmap + (fullbright[source[bit]] << 8) + *dsrc); - } - else - { - colormap = planezlight[tiltlighting[ds_x1]] + (ds_colormap - colormaps); - *dest = *(ds_transmap + (colormap[source[bit]] << 8) + *dsrc); - } - dest++; - ds_x1++; - dsrc++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - x1 = ds->x1; - - for (i = 0; i < SPANSIZE; i++) - { - bit = (((v + stepv * i) >> nflatyshift) & nflatmask) | ((u + stepu * i) >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = transmap[(fullbright[source[bit]] << 8) + dsrc[i]]; - } - else - { - colormap = planezlight[tiltlighting[x1 + i]] + (ds_colormap - colormaps); - dest[i] = transmap[(colormap[source[bit]] << 8) + dsrc[i]]; - } - } - ds->x1 += SPANSIZE; - dest += SPANSIZE; - dsrc += SPANSIZE; - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[bit]] << 8) + *dsrc); - } - else - { - colormap = planezlight[tiltlighting[ds->x1]] + (ds_colormap - colormaps); - *dest = *(transmap + (colormap[source[bit]] << 8) + 
*dsrc); - } - ds->x1++; - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[source[bit]] << 8) + *dsrc); - } - else - { - colormap = planezlight[tiltlighting[ds->x1]] + (ds_colormap - colormaps); - *dest = *(transmap + (colormap[source[bit]] << 8) + *dsrc); - } - dest++; - ds->x1++; - dsrc++; - u += stepu; - v += stepv; - } - } - } -#endif - TracyCZoneEnd(__zone); -} - -void R_DrawTiltedSplat_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - - UINT8 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - INT32 tiltlighting[MAXVIDWIDTH]; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. 
It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - - source = ds->source; - brightmap = ds->brightmap; - //colormap = ds_colormap; - fullbright = ds->fullbright; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. - i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - colormap = planezlight[tiltlighting[ds_x1]] + (ds_colormap - colormaps); - *dest = colormap[val]; - } - } - - dest++; - ds_x1++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - 
if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[val]; - } - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[val]; - } - ds->x1++; - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); - *dest = colormap[val]; - } - } - dest++; - ds->x1++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -/** \brief The R_DrawSplat_8 function - Just like R_DrawSpan_8, but skips transparent pixels. 
-*/ -void R_DrawSplat_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? - xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! 
- for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - bit &= MAXFLATBYTES; - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = fullbright[val]; - } - else - { - dest[i] = colormap[val]; - } - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[val]; - } - else - { - *dest = colormap[val]; - } - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTranslucentSplat_8 function - Just like R_DrawSplat_8, but is translucent! -*/ -void R_DrawTranslucentSplat_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? 
- xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! - for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(ds->transmap + (fullbright[val] << 8) + dest[i]); - } - else - { - dest[i] = *(ds->transmap + (colormap[val] << 8) + dest[i]); - } - - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val != TRANSPARENTPIXEL) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds->transmap + (fullbright[val] << 8) + *dest); - } - else - { - *dest = *(ds->transmap + (colormap[val] << 8) + *dest); - } - - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawFloorSprite_8 function - Just like R_DrawSplat_8, but for floor sprites. 
-*/ -void R_DrawFloorSprite_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT16 *source; - UINT16 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *translation; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? - xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = (UINT16 *)ds->source; - brightmap = (UINT16 *)ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - translation = ds->translation; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! 
- for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = fullbright[translation[val & 0xFF]]; - } - else - { - dest[i] = colormap[translation[val & 0xFF]]; - } - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[translation[val & 0xFF]]; - } - else - { - *dest = colormap[translation[val & 0xFF]]; - } - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTranslucentFloorSplat_8 function - Just like R_DrawFloorSprite_8, but is translucent! -*/ -void R_DrawTranslucentFloorSprite_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT16 *source; - UINT16 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *translation; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? 
- xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = (UINT16 *)ds->source; - brightmap = (UINT16 *)ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - translation = ds->translation; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! - for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(ds->transmap + (fullbright[translation[val & 0xFF]] << 8) + dest[i]); - } - else - { - dest[i] = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + dest[i]); - } - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds->transmap + (fullbright[translation[val & 0xFF]] << 8) + *dest); - } - else - { - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - } - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTiltedFloorSprite_8 function - Draws a tilted floor sprite. 
-*/ -void R_DrawTiltedFloorSprite_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT16 *source; - UINT16 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *translation; - UINT8 *dest; - UINT16 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = (UINT16 *)ds->source; - brightmap = (UINT16 *)ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - translation = ds->translation; - - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[translation[val & 0xFF]]; - } - else - { - *dest = colormap[translation[val & 0xFF]]; - } - } - dest++; - - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != 
NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[translation[val & 0xFF]]; - } - else - { - *dest = colormap[translation[val & 0xFF]]; - } - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[translation[val & 0xFF]]; - } - else - { - *dest = colormap[translation[val & 0xFF]]; - } - } - dest++; - - u += stepu; - v += stepv; - } - } - } -} - -/** \brief The R_DrawTiltedTranslucentFloorSprite_8 function - Draws a tilted, translucent, floor sprite. -*/ -void R_DrawTiltedTranslucentFloorSprite_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT16 *source; - UINT16 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *translation; - UINT8 *dest; - UINT16 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - UINT32 bit; - - const INT32 nflatxshift = ds->nflatxshift; - const INT32 nflatyshift = ds->nflatyshift; - const INT32 nflatmask = ds->nflatmask; - UINT8 *transmap = ds->transmap; - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = (UINT16 *)ds->source; - brightmap = (UINT16 *)ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - translation 
= ds->translation; - - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = 0; i < SPANSIZE; i++) - { - bit = (((v + stepv * i) >> nflatyshift) & nflatmask) | ((u + stepu * i) >> nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(transmap + (fullbright[translation[val & 0xFF]] << 8) + dest[i]); - } - else - { - dest[i] = *(transmap + (colormap[translation[val & 0xFF]] << 8) + dest[i]); - } - } - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[translation[val & 0xFF]] << 8) + *dest); - } - else - { - *dest = *(transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - } - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); - val = source[bit]; - if (val & 0xFF00) - { - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(transmap + (fullbright[translation[val & 0xFF]] << 8) + *dest); 
- } - else - { - *dest = *(transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - } - } - dest++; - - u += stepu; - v += stepv; - } - } - } -} - -/** \brief The R_DrawTranslucentSpan_8 function - Draws the actual span with translucency. -*/ -void R_DrawTranslucentSpan_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - size_t i; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? - xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; - xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! 
- for (i = 0; i < 8; i++) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = *(ds->transmap + (fullbright[source[bit]] << 8) + dest[i]); - } - else - { - dest[i] = *(ds->transmap + (colormap[source[bit]] << 8) + dest[i]); - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - count -= 8; - } - while (count-- && dest <= deststop) - { - bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = *(ds->transmap + (fullbright[source[bit]] << 8) + *dest); - } - else - { - *dest = *(ds->transmap + (colormap[source[bit]] << 8) + *dest); - } - dest++; - xposition += xstep; - yposition += ystep; - } -} - -void R_DrawTranslucentWaterSpan_8(drawspandata_t* ds) -{ - UINT32 xposition; - UINT32 yposition; - UINT32 xstep, ystep; - UINT32 bit; - - UINT8 *source; - UINT8 *brightmap; - UINT8 *colormap; - UINT8 *fullbright; - UINT8 *dest; - UINT8 *dsrc; - - size_t count; - size_t i; - - // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest - // can be used for the fraction part. This allows calculation of the memory address in the - // texture with two shifts, an OR and one AND. (see below) - // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one - // bit per power of two (obviously) - // Ok, because I was able to eliminate the variable spot below, this function is now FASTER - // than the original span renderer. Whodathunkit? 
- xposition = ds->xfrac << ds->nflatshiftup; yposition = (ds->yfrac + ds->waterofs) << ds->nflatshiftup; - xstep = ds->xstep << ds->nflatshiftup; ystep = ds->ystep << ds->nflatshiftup; - - source = ds->source; - brightmap = ds->brightmap; - colormap = ds->colormap; - fullbright = ds->fullbright; - dest = ylookup[ds->y] + columnofs[ds->x1]; - dsrc = screens[1] + (ds->y+ds->bgofs)*vid.width + ds->x1; - count = ds->x2 - ds->x1 + 1; - - while (count >= 8) - { - // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't - // have the uber complicated math to calculate it now, so that was a memory write we didn't - // need! - for (i = 0; i < 8; i++) - { - bit = ((yposition >> ds->nflatyshift) & ds->nflatmask) | (xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - dest[i] = fullbright[*(ds->transmap + (source[bit] << 8) + dsrc[i])]; - } - else - { - dest[i] = colormap[*(ds->transmap + (source[bit] << 8) + dsrc[i])]; - } - xposition += xstep; - yposition += ystep; - } - - dest += 8; - dsrc += 8; - count -= 8; - } - while (count--) - { - bit = ((yposition >> ds->nflatyshift) & ds->nflatmask) | (xposition >> ds->nflatxshift); - if (brightmap != NULL && brightmap[bit] == BRIGHTPIXEL) - { - *dest = fullbright[*(ds->transmap + (source[bit] << 8) + *dsrc)]; - } - else - { - *dest = colormap[*(ds->transmap + (source[bit] << 8) + *dsrc)]; - } - dest++; - dsrc++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawFogSpan_8 function - Draws the actual span with fogging. 
-*/ -void R_DrawFogSpan_8(drawspandata_t* ds) -{ - UINT8 *colormap; - UINT8 *dest; - - size_t count; - - colormap = ds->colormap; - //dest = ylookup[ds_y] + columnofs[ds_x1]; - dest = &topleft[ds->y *vid.width + ds->x1]; - - count = ds->x2 - ds->x1 + 1; - - while (count >= 4) - { - dest[0] = colormap[dest[0]]; - dest[1] = colormap[dest[1]]; - dest[2] = colormap[dest[2]]; - dest[3] = colormap[dest[3]]; - - dest += 4; - count -= 4; - } - - while (count--) - { - *dest = colormap[*dest]; - dest++; - } -} - -/** \brief The R_DrawFogColumn_8 function - Fog wall. -*/ -void R_DrawFogColumn_8(drawcolumndata_t* dc) -{ - INT32 count; - UINT8 *dest; - - count = dc->yh - dc->yl; - - // Zero length, column does not exceed a pixel. - if (count < 0) - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawFogColumn_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl*vid.width + dc->x]; - - // Determine scaling, which is the only mapping to be done. - do - { - // Simple. Apply the colormap to what's already on the screen. - *dest = dc->colormap[*dest]; - dest += vid.width; - } while (count--); -} - -/** \brief The R_DrawShadeColumn_8 function - This is for 3D floors that cast shadows on walls. - - This function just cuts the column up into sections and calls R_DrawColumn_8 -*/ -void R_DrawColumnShadowed_8(drawcolumndata_t* dc) -{ - INT32 count, realyh, i, height, bheight = 0, solid = 0; - - realyh = dc->yh; - - count = dc->yh - dc->yl; - - // Zero length, column does not exceed a pixel. 
- if (count < 0) - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - I_Error("R_DrawColumnShadowed_8: %d to %d at %d", dc->yl, dc->yh, dc->x); -#endif - - // This runs through the lightlist from top to bottom and cuts up the column accordingly. - for (i = 0; i < dc->numlights; i++) - { - // If the height of the light is above the column, get the colormap - // anyway because the lighting of the top should be affected. - solid = dc->lightlist[i].flags & FOF_CUTSOLIDS; - - height = dc->lightlist[i].height >> LIGHTSCALESHIFT; - if (solid) - { - bheight = dc->lightlist[i].botheight >> LIGHTSCALESHIFT; - if (bheight < height) - { - // confounded slopes sometimes allow partial invertedness, - // even including cases where the top and bottom heights - // should actually be the same! - // swap the height values as a workaround for this quirk - INT32 temp = height; - height = bheight; - bheight = temp; - } - } - if (height <= dc->yl) - { - dc->colormap = dc->lightlist[i].rcolormap; - dc->fullbright = colormaps; - if (encoremap) - { - dc->colormap += COLORMAP_REMAPOFFSET; - dc->fullbright += COLORMAP_REMAPOFFSET; - } - if (solid && dc->yl < bheight) - dc->yl = bheight; - continue; - } - // Found a break in the column! 
- dc->yh = height; - - if (dc->yh > realyh) - dc->yh = realyh; - (colfuncs[BASEDRAWFUNC])(dc); // R_DrawColumn_8 for the appropriate architecture - if (solid) - dc->yl = bheight; - else - dc->yl = dc->yh + 1; - - dc->colormap = dc->lightlist[i].rcolormap; - dc->fullbright = colormaps; - if (encoremap) - { - dc->colormap += COLORMAP_REMAPOFFSET; - dc->fullbright += COLORMAP_REMAPOFFSET; - } - } - dc->yh = realyh; - if (dc->yl <= realyh) - (colfuncs[BASEDRAWFUNC])(dc); // R_DrawWallColumn_8 for the appropriate architecture -} diff --git a/src/r_draw8_flat.c b/src/r_draw8_flat.c deleted file mode 100644 index f6669b069..000000000 --- a/src/r_draw8_flat.c +++ /dev/null @@ -1,80 +0,0 @@ -// SONIC ROBO BLAST 2 -//----------------------------------------------------------------------------- -// Copyright (C) 1998-2000 by DooM Legacy Team. -// Copyright (C) 1999-2020 by Sonic Team Junior. -// Copyright (C) 2023 by Kart Krew. -// -// This program is free software distributed under the -// terms of the GNU General Public License, version 2. -// See the 'LICENSE' file for more details. -//----------------------------------------------------------------------------- -/// \file r_draw8_flat.c -/// \brief 8bpp span/column drawer functions for debugging (draws in flat colors only) -/// \note no includes because this is included as part of r_draw.c - -void R_DrawColumn_Flat_8 (drawcolumndata_t* dc) -{ - INT32 count; - UINT8 color = dc->lightmap[dc->r8_flatcolor]; - register UINT8 *dest; - - count = dc->yh - dc->yl; - - if (count < 0) // Zero length, column does not exceed a pixel. - return; - -#ifdef RANGECHECK - if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) - return; -#endif - - // Framebuffer destination address. - // Use ylookup LUT to avoid multiply with ScreenWidth. - // Use columnofs LUT for subwindows? 
- - //dest = ylookup[dc_yl] + columnofs[dc_x]; - dest = &topleft[dc->yl*vid.width + dc->x]; - - count++; - - do - { - *dest = color; - dest += vid.width; - } while (--count); -} - -void R_DrawSpan_Flat_8 (drawspandata_t* ds) -{ - UINT8 *dest = ylookup[ds->y] + columnofs[ds->x1]; - - memset(dest, ds->colormap[ds->r8_flatcolor], (ds->x2 - ds->x1) + 1); -} - -void R_DrawTiltedSpan_Flat_8 (drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - INT32 tiltlighting[MAXVIDWIDTH]; - - UINT8 *dest = ylookup[ds->y]; - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - while (ds->x1 <= ds->x2) - { - dest[ds->x1] = ds->planezlight[tiltlighting[ds->x1]][ds->r8_flatcolor]; - ds->x1++; - } -} diff --git a/src/r_draw8_npo2.c b/src/r_draw8_npo2.c deleted file mode 100644 index 07adefdfe..000000000 --- a/src/r_draw8_npo2.c +++ /dev/null @@ -1,1618 +0,0 @@ -// SONIC ROBO BLAST 2 -//----------------------------------------------------------------------------- -// Copyright (C) 1998-2000 by DooM Legacy Team. -// Copyright (C) 1999-2020 by Sonic Team Junior. -// -// This program is free software distributed under the -// terms of the GNU General Public License, version 2. -// See the 'LICENSE' file for more details. 
-//----------------------------------------------------------------------------- -/// \file r_draw8_npo2.c -/// \brief 8bpp span drawer functions (for non-powers-of-two flat dimensions) -/// \note no includes because this is included as part of r_draw.c - -// ========================================================================== -// SPANS -// ========================================================================== - -#define SPANSIZE 16 -#define INVSPAN 0.0625f - -/** \brief The R_DrawSpan_NPO2_8 function - Draws the actual span. -*/ -void R_DrawSpan_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - if (dest+8 > deststop) - return; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - - *dest++ = colormap[source[((y * ds->flatwidth) + x)]]; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTiltedSpan_NPO2_8 function - Draw slopes! Holy sheit! -*/ -void R_DrawTiltedSpan_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - INT32 tiltlighting[MAXVIDWIDTH]; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = ds->source; - //colormap = ds_colormap; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. 
- // Use it only to see how things are supposed to look. - i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - colormap = planezlight[tiltlighting[ds_x1++]] + (ds_colormap - colormaps); - - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds_flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds_flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds_flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds_flatheight; - - *dest = colormap[source[((y * ds_flatwidth) + x)]]; - } - dest++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = colormap[source[((y * ds->flatwidth) + x)]]; - } - dest++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = colormap[source[((y * ds->flatwidth) + x)]]; - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = colormap[source[((y * ds->flatwidth) + x)]]; - } - dest++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -/** \brief The R_DrawTiltedTranslucentSpan_NPO2_8 function - Like DrawTiltedSpan_NPO2, but translucent -*/ -void R_DrawTiltedTranslucentSpan_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - INT32 tiltlighting[MAXVIDWIDTH]; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = ds->source; - //colormap = ds_colormap; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. 
- i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - colormap = planezlight[tiltlighting[ds_x1++]] + (ds_colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds_flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds_flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds_flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds_flatheight; - - *dest = *(ds_transmap + (colormap[source[((y * ds_flatwidth) + x)]] << 8) + *dest); - } - dest++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dest); - } - dest++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dest); - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dest); - } - dest++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -void R_DrawTiltedSplat_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - - UINT8 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - INT32 tiltlighting[MAXVIDWIDTH]; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = ds->source; - //colormap = ds_colormap; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. 
- i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - colormap = planezlight[tiltlighting[ds_x1++]] + (ds_colormap - colormaps); - - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds_flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds_flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds_flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds_flatheight; - - val = source[((y * ds_flatwidth) + x)]; - } - - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - - dest++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - } - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - dest++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - } - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - } - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - dest++; - u += stepu; - v += stepv; - } - } - } -#endif -} - -/** \brief The R_DrawSplat_NPO2_8 function - Just like R_DrawSpan_NPO2_8, but skips transparent pixels. -*/ -void R_DrawSplat_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = source[((y * ds->flatwidth) + x)]; - if (val != TRANSPARENTPIXEL) - *dest = colormap[val]; - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTranslucentSplat_NPO2_8 function - Just like R_DrawSplat_NPO2_8, but is translucent! -*/ -void R_DrawTranslucentSplat_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = source[((y * ds->flatwidth) + x)]; - if (val != TRANSPARENTPIXEL) - *dest = *(ds->transmap + (colormap[val] << 8) + *dest); - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawFloorSprite_NPO2_8 function - Just like R_DrawSplat_NPO2_8, but for floor sprites. -*/ -void R_DrawFloorSprite_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT16 *source; - UINT8 *translation; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = (UINT16 *)ds->source; - colormap = ds->colormap; - translation = ds->translation; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. 
- // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. - if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = colormap[translation[val & 0xFF]]; - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTranslucentFloorSprite_NPO2_8 function - Just like R_DrawFloorSprite_NPO2_8, but is translucent! -*/ -void R_DrawTranslucentFloorSprite_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT16 *source; - UINT8 *translation; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = (UINT16 *)ds->source; - colormap = ds->colormap; - translation = ds->translation; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. 
- if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. - if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - dest++; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTiltedFloorSprite_NPO2_8 function - Draws a tilted floor sprite. 
-*/ -void R_DrawTiltedFloorSprite_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT16 *source; - UINT8 *colormap; - UINT8 *translation; - UINT8 *dest; - UINT16 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = (UINT16 *)ds->source; - colormap = ds->colormap; - translation = ds->translation; - - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - // Lactozilla: Non-powers-of-two - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = colormap[translation[val & 0xFF]]; - dest++; - - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = colormap[translation[val & 0xFF]]; - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - // Lactozilla: Non-powers-of-two - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = colormap[translation[val & 0xFF]]; - dest++; - - u += stepu; - v += stepv; - } - } - } -} - -/** \brief The R_DrawTiltedTranslucentFloorSprite_NPO2_8 function - Draws a tilted, translucent, floor sprite. -*/ -void R_DrawTiltedTranslucentFloorSprite_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT16 *source; - UINT8 *colormap; - UINT8 *translation; - UINT8 *dest; - UINT16 val; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - source = (UINT16 *)ds->source; - colormap = ds->colormap; - translation = ds->translation; - - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) 
- { - // Lactozilla: Non-powers-of-two - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - dest++; - - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - // Lactozilla: Non-powers-of-two - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - val = source[((y * ds->flatwidth) + x)]; - if (val & 0xFF00) - *dest = *(ds->transmap + (colormap[translation[val & 0xFF]] << 8) + *dest); - dest++; - - u += stepu; - v += stepv; - } - } - } -} - -/** \brief The R_DrawTranslucentSpan_NPO2_8 function - Draws the actual span with translucency. -*/ -void R_DrawTranslucentSpan_NPO2_8 (drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - UINT32 val; - - xposition = ds->xfrac; yposition = ds->yfrac; - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - val = ((y * ds->flatwidth) + x); - *dest = *(ds->transmap + (colormap[source[val]] << 8) + *dest); - dest++; - xposition += xstep; - yposition += ystep; - } -} - -void R_DrawTranslucentWaterSpan_NPO2_8(drawspandata_t* ds) -{ - fixed_t xposition; - fixed_t yposition; - fixed_t xstep, ystep; - fixed_t x, y; - fixed_t fixedwidth, fixedheight; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - UINT8 *dsrc; - const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; - - size_t count = (ds->x2 - ds->x1 + 1); - - xposition = ds->xfrac; yposition = (ds->yfrac + ds->waterofs); - xstep = ds->xstep; ystep = ds->ystep; - - source = ds->source; - colormap = ds->colormap; - dest = ylookup[ds->y] + columnofs[ds->x1]; - dsrc = screens[1] + (ds->y+ds->bgofs)*vid.width + ds->x1; - - fixedwidth = ds->flatwidth << FRACBITS; - fixedheight = ds->flatheight << FRACBITS; - - // Fix xposition and yposition if they are out of bounds. - if (xposition < 0) - xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); - else if (xposition >= fixedwidth) - xposition %= fixedwidth; - if (yposition < 0) - yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); - else if (yposition >= fixedheight) - yposition %= fixedheight; - - while (count-- && dest <= deststop) - { - // The loops here keep the texture coordinates within the texture. - // They will rarely iterate multiple times, and are cheaper than a modulo operation, - // even if using libdivide. 
- if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop - while (xposition < 0) - xposition += fixedwidth; - else - while (xposition >= fixedwidth) - xposition -= fixedwidth; - if (ystep < 0) - while (yposition < 0) - yposition += fixedheight; - else - while (yposition >= fixedheight) - yposition -= fixedheight; - - x = (xposition >> FRACBITS); - y = (yposition >> FRACBITS); - *dest++ = colormap[*(ds->transmap + (source[((y * ds->flatwidth) + x)] << 8) + *dsrc++)]; - xposition += xstep; - yposition += ystep; - } -} - -/** \brief The R_DrawTiltedTranslucentWaterSpan_NPO2_8 function - Like DrawTiltedTranslucentSpan_NPO2, but for water -*/ -void R_DrawTiltedTranslucentWaterSpan_NPO2_8(drawspandata_t* ds) -{ - // x1, x2 = ds_x1, ds_x2 - int width = ds->x2 - ds->x1; - double iz, uz, vz; - UINT32 u, v; - int i; - - UINT8 *source; - UINT8 *colormap; - UINT8 *dest; - UINT8 *dsrc; - - double startz, startu, startv; - double izstep, uzstep, vzstep; - double endz, endu, endv; - UINT32 stepu, stepv; - INT32 tiltlighting[MAXVIDWIDTH]; - - struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); - struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); - - iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); - - // Lighting is simple. 
It's just linear interpolation from start to end - { - float planelightfloat = PLANELIGHTFLOAT; - float lightstart, lightend; - - lightend = (iz + ds->szp.x*width) * planelightfloat; - lightstart = iz * planelightfloat; - - R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); - //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); - } - - uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); - vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); - - dest = ylookup[ds->y] + columnofs[ds->x1]; - dsrc = screens[1] + (ds->y+ds->bgofs)*vid.width + ds->x1; - source = ds->source; - //colormap = ds->colormap; - -#if 0 // The "perfect" reference version of this routine. Pretty slow. - // Use it only to see how things are supposed to look. - i = 0; - do - { - double z = 1.f/iz; - u = (INT64)(uz*z); - v = (INT64)(vz*z); - - colormap = planezlight[tiltlighting[ds_x1++]] + (ds_colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds_flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds_flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds_flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds_flatheight; - - *dest = *(ds_transmap + (colormap[source[((y * ds_flatwidth) + x)]] << 8) + *dsrc++); - } - dest++; - iz += ds_szp->x; - uz += ds_sup->x; - vz += ds_svp->x; - } while (--width >= 0); -#else - startz = 1.f/iz; - startu = uz*startz; - startv = vz*startz; - - izstep = ds->szp.x * SPANSIZE; - uzstep = ds->sup.x * SPANSIZE; - vzstep = ds->svp.x * SPANSIZE; - //x1 = 0; - width++; - - while (width >= SPANSIZE) - { - iz += izstep; - uz += uzstep; - vz += vzstep; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - stepu = (INT64)((endu - startu) * INVSPAN); - stepv = (INT64)((endv - startv) * INVSPAN); - u = (INT64)(startu); - v = (INT64)(startv); - - for (i = SPANSIZE-1; i >= 0; i--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dsrc++); - } - dest++; - u += stepu; - v += stepv; - } - startu = endu; - startv = endv; - width -= SPANSIZE; - } - if (width > 0) - { - if (width == 1) - { - u = (INT64)(startu); - v = (INT64)(startv); - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. - if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dsrc++); - } - } - else - { - double left = width; - iz += ds->szp.x * left; - uz += ds->sup.x * left; - vz += ds->svp.x * left; - - endz = 1.f/iz; - endu = uz*endz; - endv = vz*endz; - left = 1.f/left; - stepu = (INT64)((endu - startu) * left); - stepv = (INT64)((endv - startv) * left); - u = (INT64)(startu); - v = (INT64)(startv); - - for (; width != 0; width--) - { - colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); - // Lactozilla: Non-powers-of-two - { - fixed_t x = (((fixed_t)u) >> FRACBITS); - fixed_t y = (((fixed_t)v) >> FRACBITS); - - // Carefully align all of my Friends. 
- if (x < 0) - x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; - else - x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; - if (y < 0) - y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; - else - y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; - - *dest = *(ds->transmap + (colormap[source[((y * ds->flatwidth) + x)]] << 8) + *dsrc++); - } - dest++; - u += stepu; - v += stepv; - } - } - } -#endif -} diff --git a/src/r_draw_column.cpp b/src/r_draw_column.cpp new file mode 100644 index 000000000..93f6b8a8d --- /dev/null +++ b/src/r_draw_column.cpp @@ -0,0 +1,413 @@ +// SONIC ROBO BLAST 2 +//----------------------------------------------------------------------------- +// Copyright (C) 1998-2000 by DooM Legacy Team. +// Copyright (C) 1999-2021 by Sonic Team Junior. +// +// This program is free software distributed under the +// terms of the GNU General Public License, version 2. +// See the 'LICENSE' file for more details. +//----------------------------------------------------------------------------- +/// \file r_draw_column.cpp +/// \brief column drawer functions +/// \note no includes because this is included as part of r_draw.cpp + +// ========================================================================== +// COLUMNS +// ========================================================================== + +// A column is a vertical slice/span of a wall texture that uses +// a has a constant z depth from top to bottom. 
+// + +enum DrawColumnType +{ + DC_BASIC = 0x0000, + DC_COLORMAP = 0x0001, + DC_TRANSMAP = 0x0002, + DC_BRIGHTMAP = 0x0004, + DC_HOLES = 0x0008, + DC_LIGHTLIST = 0x0010, +}; + +template +static constexpr UINT8 R_GetColumnTranslated(drawcolumndata_t* dc, UINT8 col) +{ + if constexpr (Type & DrawColumnType::DC_COLORMAP) + { + return dc->translation[col]; + } + else + { + return col; + } +} + +template +static constexpr UINT8 R_GetColumnBrightmapped(drawcolumndata_t* dc, UINT32 bit, UINT8 col) +{ + col = R_GetColumnTranslated(dc, col); + + if constexpr (Type & DrawColumnType::DC_BRIGHTMAP) + { + if (dc->brightmap[bit] == BRIGHTPIXEL) + { + return dc->fullbright[col]; + } + } + + return dc->colormap[col]; +} + +template +static constexpr UINT8 R_GetColumnTranslucent(drawcolumndata_t* dc, UINT8 *dest, UINT32 bit, UINT8 col) +{ + col = R_GetColumnBrightmapped(dc, bit, col); + + if constexpr (Type & DrawColumnType::DC_TRANSMAP) + { + return *(dc->transmap + (col << 8) + (*dest)); + } + else + { + return col; + } +} + +template +static constexpr UINT8 R_DrawColumnPixel(drawcolumndata_t* dc, UINT8 *dest, UINT32 bit) +{ + UINT8 col = dc->source[bit]; + + if constexpr (Type & DrawColumnType::DC_HOLES) + { + if (col == TRANSPARENTPIXEL) + { + return *dest; + } + } + + return R_GetColumnTranslucent(dc, dest, bit, col); +} + +/** \brief The R_DrawColumn function + Experiment to make software go faster. Taken from the Boom source +*/ +template +static void R_DrawColumnTemplate(drawcolumndata_t *dc) +{ + INT32 count; + UINT8 *dest; + + count = dc->yh - dc->yl; + + if (count < 0) // Zero length, column does not exceed a pixel. 
+ { + return; + } + +#ifdef RANGECHECK + if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) + { + return; + } +#endif + + if constexpr (Type & DrawColumnType::DC_LIGHTLIST) + { + constexpr DrawColumnType NewType = static_cast(Type & ~DC_LIGHTLIST); + INT32 i, realyh, height, bheight = 0, solid = 0; + drawcolumndata_t dc_copy = *dc; + + realyh = dc_copy.yh; + + // This runs through the lightlist from top to bottom and cuts up the column accordingly. + for (i = 0; i < dc->numlights; i++) + { + // If the height of the light is above the column, get the colormap + // anyway because the lighting of the top should be affected. + solid = dc->lightlist[i].flags & FOF_CUTSOLIDS; + height = dc->lightlist[i].height >> LIGHTSCALESHIFT; + + if (solid) + { + bheight = dc->lightlist[i].botheight >> LIGHTSCALESHIFT; + + if (bheight < height) + { + // confounded slopes sometimes allow partial invertedness, + // even including cases where the top and bottom heights + // should actually be the same! + // swap the height values as a workaround for this quirk + INT32 temp = height; + height = bheight; + bheight = temp; + } + } + + if (height <= dc_copy.yl) + { + dc_copy.colormap = dc->lightlist[i].rcolormap; + dc_copy.fullbright = colormaps; + + if (encoremap) + { + dc_copy.colormap += COLORMAP_REMAPOFFSET; + dc_copy.fullbright += COLORMAP_REMAPOFFSET; + } + + if (solid && dc_copy.yl < bheight) + { + dc_copy.yl = bheight; + } + + continue; + } + + // Found a break in the column! 
+ dc_copy.yh = height; + + if (dc_copy.yh > realyh) + { + dc_copy.yh = realyh; + } + + R_DrawColumnTemplate(&dc_copy); + if (solid) + { + dc_copy.yl = bheight; + } + else + { + dc_copy.yl = dc_copy.yh + 1; + } + + dc_copy.colormap = dc_copy.lightlist[i].rcolormap; + dc_copy.fullbright = colormaps; + if (encoremap) + { + dc_copy.colormap += COLORMAP_REMAPOFFSET; + dc_copy.fullbright += COLORMAP_REMAPOFFSET; + } + } + + dc_copy.yh = realyh; + + if (dc_copy.yl <= realyh) + { + R_DrawColumnTemplate(&dc_copy); + } + } + else + { + fixed_t fracstep; + fixed_t frac; + INT32 heightmask; + + // Framebuffer destination address. + // Use ylookup LUT to avoid multiply with ScreenWidth. + // Use columnofs LUT for subwindows? + + //dest = ylookup[dc_yl] + columnofs[dc_x]; + dest = &topleft[dc->yl * vid.width + dc->x]; + + count++; + + // Determine scaling, which is the only mapping to be done. + fracstep = dc->iscale; + //frac = dc_texturemid + (dc_yl - centery)*fracstep; + frac = (dc->texturemid + FixedMul((dc->yl << FRACBITS) - centeryfrac, fracstep)) * (!dc->hires); + + // Inner loop that does the actual texture mapping, e.g. a DDA-like scaling. + // This is as fast as it gets. + heightmask = dc->texheight-1; + + if (dc->texheight & heightmask) // not a power of 2 -- killough + { + heightmask++; + heightmask <<= FRACBITS; + + if (frac < 0) + { + while ((frac += heightmask) < 0) + { + ; + } + } + else + { + while (frac >= heightmask) + { + frac -= heightmask; + } + } + + do + { + // Re-map color indices from wall texture column + // using a lighting/special effects LUT. + // heightmask is the Tutti-Frutti fix + *dest = R_DrawColumnPixel(dc, dest, frac >> FRACBITS); + + dest += vid.width; + + // Avoid overflow. 
+ if (fracstep > 0x7FFFFFFF - frac) + { + frac += fracstep - heightmask; + } + else + { + frac += fracstep; + } + + while (frac >= heightmask) + { + frac -= heightmask; + } + } + while (--count); + } + else + { + while ((count -= 2) >= 0) // texture height is a power of 2 + { + *dest = R_DrawColumnPixel(dc, dest, (frac>>FRACBITS) & heightmask); + + dest += vid.width; + frac += fracstep; + + *dest = R_DrawColumnPixel(dc, dest, (frac>>FRACBITS) & heightmask); + + dest += vid.width; + frac += fracstep; + } + + if (count & 1) + { + *dest = R_DrawColumnPixel(dc, dest, (frac>>FRACBITS) & heightmask); + } + } + } +} + +#define DEFINE_COLUMN_FUNC(name, flags) \ + void name(drawcolumndata_t *dc) \ + { \ + ZoneScoped; \ + constexpr DrawColumnType opt = static_cast(flags); \ + R_DrawColumnTemplate(dc); \ + } + +#define DEFINE_COLUMN_COMBO(name, flags) \ + DEFINE_COLUMN_FUNC(name, flags) \ + DEFINE_COLUMN_FUNC(name ## _Brightmap, flags|DC_BRIGHTMAP) + +DEFINE_COLUMN_COMBO(R_DrawColumn, DC_BASIC) +DEFINE_COLUMN_COMBO(R_DrawTranslucentColumn, DC_TRANSMAP) +DEFINE_COLUMN_COMBO(R_DrawTranslatedColumn, DC_COLORMAP) +DEFINE_COLUMN_COMBO(R_DrawColumnShadowed, DC_LIGHTLIST) +DEFINE_COLUMN_COMBO(R_DrawTranslatedTranslucentColumn, DC_COLORMAP|DC_TRANSMAP) +DEFINE_COLUMN_COMBO(R_Draw2sMultiPatchColumn, DC_HOLES) +DEFINE_COLUMN_COMBO(R_Draw2sMultiPatchTranslucentColumn, DC_HOLES|DC_TRANSMAP) + +void R_DrawFogColumn(drawcolumndata_t *dc) +{ + ZoneScoped; + + INT32 count; + UINT8 *dest; + + count = dc->yh - dc->yl; + + // Zero length, column does not exceed a pixel. + if (count < 0) + return; + +#ifdef RANGECHECK + if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) + return; +#endif + + // Framebuffer destination address. + // Use ylookup LUT to avoid multiply with ScreenWidth. + // Use columnofs LUT for subwindows? 
+ //dest = ylookup[dc_yl] + columnofs[dc_x]; + dest = &topleft[dc->yl*vid.width + dc->x]; + + // Determine scaling, which is the only mapping to be done. + do + { + // Simple. Apply the colormap to what's already on the screen. + *dest = dc->colormap[*dest]; + dest += vid.width; + } + while (count--); +} + +void R_DrawDropShadowColumn(drawcolumndata_t *dc) +{ + ZoneScoped; + + // Hack: A cut-down copy of R_DrawTranslucentColumn_8 that does not read texture + // data since something about calculating the texture reading address for drop shadows is broken. + // dc_texturemid and dc_iscale get wrong values for drop shadows, however those are not strictly + // needed for the current design of the shadows, so this function bypasses the issue + // by not using those variables at all. + + INT32 count; + UINT8 *dest; + + count = dc->yh - dc->yl + 1; + + if (count <= 0) // Zero length, column does not exceed a pixel. + return; + + dest = &topleft[dc->yl*vid.width + dc->x]; + + const UINT8 *transmap_offset = dc->transmap + (dc->shadowcolor << 8); + while ((count -= 2) >= 0) + { + *dest = *(transmap_offset + (*dest)); + dest += vid.width; + *dest = *(transmap_offset + (*dest)); + dest += vid.width; + } + + if (count & 1) + *dest = *(transmap_offset + (*dest)); +} + +void R_DrawColumn_Flat(drawcolumndata_t *dc) +{ + ZoneScoped; + + INT32 count; + UINT8 color = dc->lightmap[dc->r8_flatcolor]; + UINT8 *dest; + + count = dc->yh - dc->yl; + + if (count < 0) // Zero length, column does not exceed a pixel. + return; + +#ifdef RANGECHECK + if ((unsigned)dc->x >= (unsigned)vid.width || dc->yl < 0 || dc->yh >= vid.height) + return; +#endif + + // Framebuffer destination address. + // Use ylookup LUT to avoid multiply with ScreenWidth. + // Use columnofs LUT for subwindows? 
+ + //dest = ylookup[dc_yl] + columnofs[dc_x]; + dest = &topleft[dc->yl*vid.width + dc->x]; + + count++; + + do + { + *dest = color; + dest += vid.width; + } + while (--count); +} diff --git a/src/r_draw_span.cpp b/src/r_draw_span.cpp new file mode 100644 index 000000000..bd7d2cc41 --- /dev/null +++ b/src/r_draw_span.cpp @@ -0,0 +1,866 @@ +// SONIC ROBO BLAST 2 +//----------------------------------------------------------------------------- +// Copyright (C) 1998-2000 by DooM Legacy Team. +// Copyright (C) 1999-2021 by Sonic Team Junior. +// +// This program is free software distributed under the +// terms of the GNU General Public License, version 2. +// See the 'LICENSE' file for more details. +//----------------------------------------------------------------------------- +/// \file r_draw_span.cpp +/// \brief span drawer functions +/// \note no includes because this is included as part of r_draw.cpp + +using namespace libdivide; + +// ========================================================================== +// SPANS +// ========================================================================== + +#define SPANSIZE 16 +#define INVSPAN 0.0625f + +// 4194303 = (2048x2048)-1 (2048x2048 is maximum flat size) +#define MAXFLATBYTES 4194303 + +#define PLANELIGHTFLOAT (BASEVIDWIDTH * BASEVIDWIDTH / vid.width / ds->zeroheight / 21.0f * FIXED_TO_FLOAT(fovtan[viewssnum])) + +enum DrawSpanType +{ + DS_BASIC = 0x0000, + DS_COLORMAP = 0x0001, + DS_TRANSMAP = 0x0002, + DS_BRIGHTMAP = 0x0004, + DS_HOLES = 0x0008, + DS_RIPPLE = 0x0010, + DS_SPRITE = 0x0020, +}; + +template +static constexpr UINT8 R_GetSpanTranslated(drawspandata_t* ds, UINT8 col) +{ + if constexpr (Type & DrawSpanType::DS_COLORMAP) + { + return ds->translation[col]; + } + else + { + return col; + } +} + +template +static constexpr UINT8 R_GetSpanBrightmapped(drawspandata_t* ds, UINT8 *colormap, UINT32 bit, UINT8 col) +{ + col = R_GetSpanTranslated(ds, col); + + if constexpr (Type & DrawSpanType::DS_BRIGHTMAP) + 
{ + UINT8 brightCol = 31; + + if constexpr (Type & DrawSpanType::DS_SPRITE) + { + UINT16 *spriteSource = reinterpret_cast(ds->brightmap); + UINT16 spriteCol = spriteSource[bit]; + + if (spriteCol & 0xFF00) + { + brightCol = (spriteCol & 0xFF); + } + } + else + { + brightCol = ds->brightmap[bit]; + } + + if (brightCol == BRIGHTPIXEL) + { + return ds->fullbright[col]; + } + } + + return colormap[col]; +} + +template +static constexpr UINT8 R_GetSpanTranslucent(drawspandata_t* ds, UINT8 *dsrc, UINT8 *colormap, UINT32 bit, UINT8 col) +{ + col = R_GetSpanBrightmapped(ds, colormap, bit, col); + + if constexpr (Type & DrawSpanType::DS_TRANSMAP) + { + return *(ds->transmap + (col << 8) + (*dsrc)); + } + else + { + return col; + } +} + +template +static constexpr UINT8 R_DrawSpanPixel(drawspandata_t* ds, UINT8 *dsrc, UINT8 *colormap, UINT32 bit) +{ + UINT8 col = 0; + + if constexpr (Type & DrawSpanType::DS_SPRITE) + { + UINT16 *spriteSource = reinterpret_cast(ds->source); + UINT16 spriteCol = spriteSource[bit]; + + if (spriteCol & 0xFF00) + { + col = (spriteCol & 0xFF); + } + else + { + return *dsrc; + } + } + else + { + col = ds->source[bit]; + } + + if constexpr (Type & DrawSpanType::DS_HOLES) + { + if (col == TRANSPARENTPIXEL) + { + return *dsrc; + } + } + + return R_GetSpanTranslucent(ds, dsrc, colormap, bit, col); +} + +/** \brief The R_DrawSpan_8 function + Draws the actual span. +*/ +template +static void R_DrawSpanTemplate(drawspandata_t* ds) +{ + fixed_t xposition; + fixed_t yposition; + fixed_t xstep, ystep; + UINT32 bit; + + UINT8 *dest; + UINT8 *dsrc; + + const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; + + size_t count = (ds->x2 - ds->x1 + 1); + size_t i; + + xposition = ds->xfrac; yposition = ds->yfrac; + xstep = ds->xstep; ystep = ds->ystep; + + if constexpr (Type & DS_RIPPLE) + { + yposition += ds->waterofs; + } + + // SoM: we only need 6 bits for the integer part (0 thru 63) so the rest + // can be used for the fraction part. 
This allows calculation of the memory address in the + // texture with two shifts, an OR and one AND. (see below) + // for texture sizes > 64 the amount of precision we can allow will decrease, but only by one + // bit per power of two (obviously) + // Ok, because I was able to eliminate the variable spot below, this function is now FASTER + // than the original span renderer. Whodathunkit? + xposition <<= ds->nflatshiftup; yposition <<= ds->nflatshiftup; + xstep <<= ds->nflatshiftup; ystep <<= ds->nflatshiftup; + + dest = ylookup[ds->y] + columnofs[ds->x1]; + if constexpr (Type & DS_RIPPLE) + { + dsrc = screens[1] + (ds->y + ds->bgofs) * vid.width + ds->x1; + } + else + { + dsrc = dest; + } + + if (dest+8 > deststop) + { + return; + } + + while (count >= 8) + { + // SoM: Why didn't I see this earlier? the spot variable is a waste now because we don't + // have the uber complicated math to calculate it now, so that was a memory write we didn't + // need! + + for (i = 0; i < 8; i++) + { + bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); + + dest[i] = R_DrawSpanPixel(ds, dsrc, ds->colormap, bit); + + xposition += xstep; + yposition += ystep; + } + + dest += 8; + dsrc += 8; + + count -= 8; + } + + while (count-- && dest <= deststop) + { + bit = (((UINT32)yposition >> ds->nflatyshift) & ds->nflatmask) | ((UINT32)xposition >> ds->nflatxshift); + + *dest = R_DrawSpanPixel(ds, dsrc, ds->colormap, bit); + + dest++; + dsrc++; + + xposition += xstep; + yposition += ystep; + } +} + +// R_CalcTiltedLighting +// Exactly what it says on the tin. I wish I wasn't too lazy to explain things properly. +static void R_CalcTiltedLighting(INT32 *lightbuffer, INT32 x1, INT32 x2, fixed_t start, fixed_t end) +{ + // ZDoom uses a different lighting setup to us, and I couldn't figure out how to adapt their version + // of this function. Here's my own. 
+ INT32 left = x1, right = x2; + fixed_t step = (end-start)/(x2 - x1 + 1); + INT32 i; + + // I wanna do some optimizing by checking for out-of-range segments on either side to fill in all at once, + // but I'm too bad at coding to not crash the game trying to do that. I guess this is fast enough for now... + + for (i = left; i <= right; i++) + { + lightbuffer[i] = (start += step) >> FRACBITS; + + if (lightbuffer[i] < 0) + { + lightbuffer[i] = 0; + } + else if (lightbuffer[i] >= MAXLIGHTSCALE) + { + lightbuffer[i] = MAXLIGHTSCALE-1; + } + } +} + +template +static void R_DrawTiltedSpanTemplate(drawspandata_t* ds) +{ + // x1, x2 = ds_x1, ds_x2 + int width = ds->x2 - ds->x1; + double iz, uz, vz; + UINT32 u, v; + int i; + + UINT8 *colormap; + UINT8 *dest; + UINT8 *dsrc; + + double startz, startu, startv; + double izstep, uzstep, vzstep; + double endz, endu, endv; + UINT32 stepu, stepv; + UINT32 bit; + INT32 tiltlighting[MAXVIDWIDTH]; + + INT32 x1 = ds->x1; + const INT32 nflatxshift = ds->nflatxshift; + const INT32 nflatyshift = ds->nflatyshift; + const INT32 nflatmask = ds->nflatmask; + + iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); + + // Lighting is simple. 
It's just linear interpolation from start to end + if constexpr (!(Type & DS_SPRITE)) + { + float planelightfloat = PLANELIGHTFLOAT; + float lightstart, lightend; + + lightend = (iz + ds->szp.x*width) * planelightfloat; + lightstart = iz * planelightfloat; + + R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); + //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); + } + + uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); + vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); + + colormap = ds->colormap; + + dest = ylookup[ds->y] + columnofs[ds->x1]; + if constexpr (Type & DS_RIPPLE) + { + dsrc = screens[1] + (ds->y + ds->bgofs) * vid.width + ds->x1; + } + else + { + dsrc = dest; + } + +#if 0 // The "perfect" reference version of this routine. Pretty slow. + // Use it only to see how things are supposed to look. + i = 0; + do + { + double z = 1.f/iz; + u = (INT64)(uz*z); + v = (INT64)(vz*z); + + bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); + if constexpr (!(Type & DS_SPRITE)) + { + colormap = planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); + } + *dest = R_DrawSpanPixel(ds, dsrc, colormap, bit); + dest++; + ds->x1++; + dsrc++; + iz += ds_szp->x; + uz += ds_sup->x; + vz += ds_svp->x; + } while (--width >= 0); +#else + startz = 1.f/iz; + startu = uz*startz; + startv = vz*startz; + + izstep = ds->szp.x * SPANSIZE; + uzstep = ds->sup.x * SPANSIZE; + vzstep = ds->svp.x * SPANSIZE; + //x1 = 0; + width++; + + while (width >= SPANSIZE) + { + iz += izstep; + uz += uzstep; + vz += vzstep; + + endz = 1.f/iz; + endu = uz*endz; + endv = vz*endz; + stepu = (INT64)((endu - startu) * INVSPAN); + stepv = (INT64)((endv - startv) * INVSPAN); + u = (INT64)(startu); + v = (INT64)(startv); + + x1 = ds->x1; + + for (i = 0; i < SPANSIZE; i++) + { + bit = (((v + stepv * i) >> nflatyshift) & nflatmask) | ((u + stepu * i) >> 
nflatxshift); + + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[x1 + i]] + (ds->colormap - colormaps); + } + + dest[i] = R_DrawSpanPixel(ds, &dsrc[i], colormap, bit); + } + + ds->x1 += SPANSIZE; + dest += SPANSIZE; + dsrc += SPANSIZE; + startu = endu; + startv = endv; + width -= SPANSIZE; + } + + if (width > 0) + { + if (width == 1) + { + u = (INT64)(startu); + v = (INT64)(startv); + bit = ((v >> nflatyshift) & nflatmask) | (u >> nflatxshift); + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); + } + *dest = R_DrawSpanPixel(ds, dsrc, colormap, bit); + ds->x1++; + } + else + { + double left = width; + iz += ds->szp.x * left; + uz += ds->sup.x * left; + vz += ds->svp.x * left; + + endz = 1.f/iz; + endu = uz*endz; + endv = vz*endz; + left = 1.f/left; + stepu = (INT64)((endu - startu) * left); + stepv = (INT64)((endv - startv) * left); + u = (INT64)(startu); + v = (INT64)(startv); + + for (; width != 0; width--) + { + bit = ((v >> ds->nflatyshift) & ds->nflatmask) | (u >> ds->nflatxshift); + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1]] + (ds->colormap - colormaps); + } + *dest = R_DrawSpanPixel(ds, dsrc, colormap, bit); + dest++; + ds->x1++; + dsrc++; + u += stepu; + v += stepv; + } + } + } +#endif +} + +/** \brief The R_DrawSpan_NPO2 function + Draws the actual span. 
+*/ +template +static void R_DrawNPO2SpanTemplate(drawspandata_t* ds) +{ + fixed_t xposition; + fixed_t yposition; + fixed_t xstep, ystep; + fixed_t x, y; + fixed_t fixedwidth, fixedheight; + + UINT8 *dest; + UINT8 *dsrc; + const UINT8 *deststop = screens[0] + vid.rowbytes * vid.height; + + size_t count = (ds->x2 - ds->x1 + 1); + + xposition = ds->xfrac; yposition = ds->yfrac; + xstep = ds->xstep; ystep = ds->ystep; + + if constexpr (Type & DS_RIPPLE) + { + yposition += ds->waterofs; + } + + dest = ylookup[ds->y] + columnofs[ds->x1]; + + if constexpr (Type & DS_RIPPLE) + { + dsrc = screens[1] + (ds->y + ds->bgofs) * vid.width + ds->x1; + } + else + { + dsrc = dest; + } + + if (dest+8 > deststop) + return; + + fixedwidth = ds->flatwidth << FRACBITS; + fixedheight = ds->flatheight << FRACBITS; + + // Fix xposition and yposition if they are out of bounds. + if (xposition < 0) + xposition = fixedwidth - ((UINT32)(fixedwidth - xposition) % fixedwidth); + else if (xposition >= fixedwidth) + xposition %= fixedwidth; + if (yposition < 0) + yposition = fixedheight - ((UINT32)(fixedheight - yposition) % fixedheight); + else if (yposition >= fixedheight) + yposition %= fixedheight; + + while (count-- && dest <= deststop) + { + // The loops here keep the texture coordinates within the texture. + // They will rarely iterate multiple times, and are cheaper than a modulo operation, + // even if using libdivide. 
+ if (xstep < 0) // These if statements are hopefully hoisted by the compiler to above this loop + while (xposition < 0) + xposition += fixedwidth; + else + while (xposition >= fixedwidth) + xposition -= fixedwidth; + if (ystep < 0) + while (yposition < 0) + yposition += fixedheight; + else + while (yposition >= fixedheight) + yposition -= fixedheight; + + x = (xposition >> FRACBITS); + y = (yposition >> FRACBITS); + + *dest = R_DrawSpanPixel(ds, dsrc, ds->colormap, ((y * ds->flatwidth) + x)); + dest++; + dsrc++; + + xposition += xstep; + yposition += ystep; + } +} + +/** \brief The R_DrawTiltedSpan_NPO2_8 function + Draw slopes! Holy sheit! +*/ +template +static void R_DrawTiltedNPO2SpanTemplate(drawspandata_t* ds) +{ + // x1, x2 = ds_x1, ds_x2 + int width = ds->x2 - ds->x1; + double iz, uz, vz; + UINT32 u, v; + int i; + + UINT8 *colormap; + UINT8 *dest; + UINT8 *dsrc; + + double startz, startu, startv; + double izstep, uzstep, vzstep; + double endz, endu, endv; + UINT32 stepu, stepv; + INT32 tiltlighting[MAXVIDWIDTH]; + + struct libdivide_u32_t x_divider = libdivide_u32_gen(ds->flatwidth); + struct libdivide_u32_t y_divider = libdivide_u32_gen(ds->flatheight); + + iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); + + // Lighting is simple. 
It's just linear interpolation from start to end + if constexpr (!(Type & DS_SPRITE)) + { + float planelightfloat = PLANELIGHTFLOAT; + float lightstart, lightend; + + lightend = (iz + ds->szp.x*width) * planelightfloat; + lightstart = iz * planelightfloat; + + R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); + //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); + } + + uz = ds->sup.z + ds->sup.y*(centery-ds->y) + ds->sup.x*(ds->x1-centerx); + vz = ds->svp.z + ds->svp.y*(centery-ds->y) + ds->svp.x*(ds->x1-centerx); + + colormap = ds->colormap; + + dest = ylookup[ds->y] + columnofs[ds->x1]; + + if constexpr (Type & DS_RIPPLE) + { + dsrc = screens[1] + (ds->y + ds->bgofs) * vid.width + ds->x1; + } + else + { + dsrc = dest; + } + +#if 0 // The "perfect" reference version of this routine. Pretty slow. + // Use it only to see how things are supposed to look. + i = 0; + do + { + double z = 1.f/iz; + u = (INT64)(uz*z); + v = (INT64)(vz*z); + + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); + } + + // Lactozilla: Non-powers-of-two + { + fixed_t x = (((fixed_t)u) >> FRACBITS); + fixed_t y = (((fixed_t)v) >> FRACBITS); + + // Carefully align all of my Friends. 
+ if (x < 0)
+ x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth;
+ else
+ x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth;
+ if (y < 0)
+ y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight;
+ else
+ y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight;
+
+ *dest = R_DrawSpanPixel(ds, dsrc, colormap, ((y * ds->flatwidth) + x));
+ }
+ dest++;
+ dsrc++;
+ iz += ds->szp.x;
+ uz += ds->sup.x;
+ vz += ds->svp.x;
+ } while (--width >= 0);
+#else
+ startz = 1.f/iz;
+ startu = uz*startz;
+ startv = vz*startz;
+
+ izstep = ds->szp.x * SPANSIZE;
+ uzstep = ds->sup.x * SPANSIZE;
+ vzstep = ds->svp.x * SPANSIZE;
+ //x1 = 0;
+ width++;
+
+ while (width >= SPANSIZE)
+ {
+ iz += izstep;
+ uz += uzstep;
+ vz += vzstep;
+
+ endz = 1.f/iz;
+ endu = uz*endz;
+ endv = vz*endz;
+ stepu = (INT64)((endu - startu) * INVSPAN);
+ stepv = (INT64)((endv - startv) * INVSPAN);
+ u = (INT64)(startu);
+ v = (INT64)(startv);
+
+ for (i = SPANSIZE-1; i >= 0; i--)
+ {
+ if constexpr (!(Type & DS_SPRITE))
+ {
+ colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps);
+ }
+
+ // Lactozilla: Non-powers-of-two
+ {
+ fixed_t x = (((fixed_t)u) >> FRACBITS);
+ fixed_t y = (((fixed_t)v) >> FRACBITS);
+
+ // Carefully align all of my Friends. 
+ if (x < 0) + x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; + else + x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; + if (y < 0) + y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; + else + y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; + + *dest = R_DrawSpanPixel(ds, dsrc, colormap, ((y * ds->flatwidth) + x)); + } + dest++; + dsrc++; + u += stepu; + v += stepv; + } + startu = endu; + startv = endv; + width -= SPANSIZE; + } + if (width > 0) + { + if (width == 1) + { + u = (INT64)(startu); + v = (INT64)(startv); + + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); + } + + // Lactozilla: Non-powers-of-two + { + fixed_t x = (((fixed_t)u) >> FRACBITS); + fixed_t y = (((fixed_t)v) >> FRACBITS); + + // Carefully align all of my Friends. + if (x < 0) + x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; + else + x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; + if (y < 0) + y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; + else + y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; + + *dest = R_DrawSpanPixel(ds, dsrc, colormap, ((y * ds->flatwidth) + x)); + } + } + else + { + double left = width; + iz += ds->szp.x * left; + uz += ds->sup.x * left; + vz += ds->svp.x * left; + + endz = 1.f/iz; + endu = uz*endz; + endv = vz*endz; + left = 1.f/left; + stepu = (INT64)((endu - startu) * left); + stepv = (INT64)((endv - startv) * left); + u = (INT64)(startu); + v = (INT64)(startv); + + for (; width != 0; width--) + { + if constexpr (!(Type & DS_SPRITE)) + { + colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); + } + + // Lactozilla: Non-powers-of-two + { + fixed_t x = (((fixed_t)u) >> FRACBITS); + fixed_t y = (((fixed_t)v) >> FRACBITS); + + // Carefully align all of my Friends. 
+ if (x < 0) + x += (libdivide_u32_do((UINT32)(-x-1), &x_divider) + 1) * ds->flatwidth; + else + x -= libdivide_u32_do((UINT32)x, &x_divider) * ds->flatwidth; + if (y < 0) + y += (libdivide_u32_do((UINT32)(-y-1), &y_divider) + 1) * ds->flatheight; + else + y -= libdivide_u32_do((UINT32)y, &y_divider) * ds->flatheight; + + *dest = R_DrawSpanPixel(ds, dsrc, colormap, ((y * ds->flatwidth) + x)); + } + dest++; + dsrc++; + u += stepu; + v += stepv; + } + } + } +#endif +} + +#define DEFINE_SPAN_FUNC(name, flags, template) \ + void name(drawspandata_t* ds) \ + { \ + ZoneScoped; \ + constexpr DrawSpanType opt = static_cast(flags); \ + template(ds); \ + } + +#define DEFINE_SPAN_COMBO(name, flags) \ + DEFINE_SPAN_FUNC(name, flags, R_DrawSpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Tilted, flags, R_DrawTiltedSpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _NPO2, flags, R_DrawNPO2SpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Tilted_NPO2, flags, R_DrawTiltedNPO2SpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Brightmap, flags|DS_BRIGHTMAP, R_DrawSpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Tilted_Brightmap, flags|DS_BRIGHTMAP, R_DrawTiltedSpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Brightmap_NPO2, flags|DS_BRIGHTMAP, R_DrawNPO2SpanTemplate) \ + DEFINE_SPAN_FUNC(name ## _Tilted_Brightmap_NPO2, flags|DS_BRIGHTMAP, R_DrawTiltedNPO2SpanTemplate) + +DEFINE_SPAN_COMBO(R_DrawSpan, DS_BASIC) +DEFINE_SPAN_COMBO(R_DrawTranslucentSpan, DS_TRANSMAP) +DEFINE_SPAN_COMBO(R_DrawSplat, DS_HOLES) +DEFINE_SPAN_COMBO(R_DrawTranslucentSplat, DS_TRANSMAP|DS_HOLES) +DEFINE_SPAN_COMBO(R_DrawFloorSprite, DS_COLORMAP|DS_SPRITE) +DEFINE_SPAN_COMBO(R_DrawTranslucentFloorSprite, DS_COLORMAP|DS_TRANSMAP|DS_SPRITE) +DEFINE_SPAN_COMBO(R_DrawTranslucentWaterSpan, DS_TRANSMAP|DS_RIPPLE) + +void R_DrawFogSpan(drawspandata_t* ds) +{ + ZoneScoped; + + UINT8 *colormap; + UINT8 *dest; + + size_t count; + + colormap = ds->colormap; + + //dest = ylookup[ds_y] + columnofs[ds_x1]; + dest = &topleft[ds->y *vid.width + ds->x1]; + + 
count = ds->x2 - ds->x1 + 1; + + while (count >= 4) + { + dest[0] = colormap[dest[0]]; + dest[1] = colormap[dest[1]]; + dest[2] = colormap[dest[2]]; + dest[3] = colormap[dest[3]]; + + dest += 4; + count -= 4; + } + + while (count--) + { + *dest = colormap[*dest]; + dest++; + } +} + +void R_DrawFogSpan_Tilted(drawspandata_t* ds) +{ + ZoneScoped; + + // x1, x2 = ds_x1, ds_x2 + int width = ds->x2 - ds->x1; + double iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); + INT32 tiltlighting[MAXVIDWIDTH]; + + UINT8 *dest = ylookup[ds->y] + columnofs[ds->x1]; + + // Lighting is simple. It's just linear interpolation from start to end + { + float planelightfloat = PLANELIGHTFLOAT; + float lightstart, lightend; + + lightend = (iz + ds->szp.x*width) * planelightfloat; + lightstart = iz * planelightfloat; + + R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); + //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); + } + + do + { + UINT8 *colormap = ds->planezlight[tiltlighting[ds->x1++]] + (ds->colormap - colormaps); + *dest = colormap[*dest]; + dest++; + } + while (--width >= 0); +} + +void R_DrawSpan_Flat(drawspandata_t* ds) +{ + ZoneScoped; + + UINT8 *dest = ylookup[ds->y] + columnofs[ds->x1]; + memset(dest, ds->colormap[ds->r8_flatcolor], (ds->x2 - ds->x1) + 1); +} + +void R_DrawTiltedSpan_Flat(drawspandata_t* ds) +{ + ZoneScoped; + + // x1, x2 = ds_x1, ds_x2 + int width = ds->x2 - ds->x1; + double iz = ds->szp.z + ds->szp.y*(centery-ds->y) + ds->szp.x*(ds->x1-centerx); + INT32 tiltlighting[MAXVIDWIDTH]; + + UINT8 *dest = ylookup[ds->y]; + + // Lighting is simple. 
It's just linear interpolation from start to end + { + float planelightfloat = PLANELIGHTFLOAT; + float lightstart, lightend; + + lightend = (iz + ds->szp.x*width) * planelightfloat; + lightstart = iz * planelightfloat; + + R_CalcTiltedLighting(tiltlighting, ds->x1, ds->x2, FLOAT_TO_FIXED(lightstart), FLOAT_TO_FIXED(lightend)); + //CONS_Printf("tilted lighting %f to %f (foc %f)\n", lightstart, lightend, focallengthf); + } + + while (ds->x1 <= ds->x2) + { + dest[ds->x1] = ds->planezlight[tiltlighting[ds->x1]][ds->r8_flatcolor]; + ds->x1++; + } +} diff --git a/src/r_plane.cpp b/src/r_plane.cpp index 36ba9d1b5..b85cafbf9 100644 --- a/src/r_plane.cpp +++ b/src/r_plane.cpp @@ -962,7 +962,7 @@ void R_DrawSinglePlane(drawspandata_t *ds, visplane_t *pl, boolean allow_paralle { dc.yl = pl->top[dc.x]; dc.yh = pl->bottom[dc.x]; - R_DrawColumn_Flat_8(&dc); + R_DrawColumn_Flat(&dc); } } else @@ -1202,6 +1202,9 @@ void R_DrawSinglePlane(drawspandata_t *ds, visplane_t *pl, boolean allow_paralle case SPANDRAWFUNC_SPLAT: spanfunctype = SPANDRAWFUNC_TILTEDSPLAT; break; + case SPANDRAWFUNC_FOG: + spanfunctype = SPANDRAWFUNC_TILTEDFOG; + break; default: spanfunctype = SPANDRAWFUNC_TILTED; break; @@ -1240,77 +1243,6 @@ void R_DrawSinglePlane(drawspandata_t *ds, visplane_t *pl, boolean allow_paralle for (x = pl->minx; x <= stop; x++) R_MakeSpans(mapfunc, spanfunc, ds, x, pl->top[x-1], pl->bottom[x-1], pl->top[x], pl->bottom[x], allow_parallel); - -/* -QUINCUNX anti-aliasing technique (sort of) - -Normally, Quincunx antialiasing staggers pixels -in a 5-die pattern like so: - -o o - o -o o - -To simulate this, we offset the plane by -FRACUNIT/4 in each direction, and draw -at 50% translucency. The result is -a 'smoothing' of the texture while -using the palette colors. 
-*/ -#ifdef QUINCUNX - if (spanfunc == spanfuncs[BASEDRAWFUNC]) - { - INT32 i; - ds_transmap = R_GetTranslucencyTable(tr_trans50); - spanfunc = spanfuncs[SPANDRAWFUNC_TRANS]; - for (i=0; i<4; i++) - { - xoffs = pl->xoffs; - yoffs = pl->yoffs; - - switch(i) - { - case 0: - xoffs -= FRACUNIT/4; - yoffs -= FRACUNIT/4; - break; - case 1: - xoffs -= FRACUNIT/4; - yoffs += FRACUNIT/4; - break; - case 2: - xoffs += FRACUNIT/4; - yoffs -= FRACUNIT/4; - break; - case 3: - xoffs += FRACUNIT/4; - yoffs += FRACUNIT/4; - break; - } - ds->planeheight = abs(pl->height - pl->viewz); - - if (light >= LIGHTLEVELS) - light = LIGHTLEVELS-1; - - if (light < 0) - light = 0; - - planezlight = zlight[light]; - - // set the maximum value for unsigned - pl->top[pl->maxx+1] = 0xffff; - pl->top[pl->minx-1] = 0xffff; - pl->bottom[pl->maxx+1] = 0x0000; - pl->bottom[pl->minx-1] = 0x0000; - - stop = pl->maxx + 1; - - for (x = pl->minx; x <= stop; x++) - R_MakeSpans(mapfunc, x, pl->top[x-1], pl->bottom[x-1], - pl->top[x], pl->bottom[x]); - } - } -#endif } void R_PlaneBounds(visplane_t *plane) diff --git a/src/r_segs.cpp b/src/r_segs.cpp index c349ca712..b376605de 100644 --- a/src/r_segs.cpp +++ b/src/r_segs.cpp @@ -688,7 +688,7 @@ void R_RenderMaskedSegRange(drawseg_t *drawseg, INT32 x1, INT32 x2) if (debug) { - colfunc = R_DrawColumn_Flat_8; + colfunc = R_DrawColumn_Flat; dc->r8_flatcolor = R_DebugLineColor(ldef); R_RenderMaskedSegLoopDebug(dc, drawseg, x1, x2, colfunc_2s); } diff --git a/src/r_splats.c b/src/r_splats.c index 0607b4832..0c2511b6e 100644 --- a/src/r_splats.c +++ b/src/r_splats.c @@ -30,20 +30,8 @@ static void prepare_rastertab(void); static void R_RasterizeFloorSplat(floorsplat_t *pSplat, vector2_t *verts, vissprite_t *vis); -#ifdef USEASM -void ASMCALL rasterize_segment_tex_asm(INT32 x1, INT32 y1, INT32 x2, INT32 y2, INT32 tv1, INT32 tv2, INT32 tc, INT32 dir); -#endif - static void rasterize_segment_tex(INT32 x1, INT32 y1, INT32 x2, INT32 y2, INT32 tv1, INT32 tv2, INT32 tc, INT32 
dir) { -#ifdef USEASM - if (R_ASM) - { - rasterize_segment_tex_asm(x1, y1, x2, y2, tv1, tv2, tc, dir); - return; - } - else -#endif { fixed_t xs, xe, count; fixed_t dx0, dx1; diff --git a/src/screen.c b/src/screen.c index 889af63cb..5d223a0fb 100644 --- a/src/screen.c +++ b/src/screen.c @@ -72,126 +72,116 @@ UINT8 *scr_borderpatch; // flat used to fill the reduced view borders set at ST_ // ========================================================================= -// Short and Tall sky drawer, for the current color mode -void (*walldrawerfunc)(void); - -boolean R_ASM = true; -boolean R_486 = false; -boolean R_586 = false; -boolean R_MMX = false; -boolean R_SSE = false; -boolean R_3DNow = false; -boolean R_MMXExt = false; -boolean R_SSE2 = false; - void SCR_SetDrawFuncs(void) { // - // setup the right draw routines for either 8bpp or 16bpp + // setup the right draw routines // - if (true)//vid.bpp == 1) //Always run in 8bpp. todo: remove all 16bpp code? - { - colfuncs[BASEDRAWFUNC] = R_DrawColumn_8; - spanfuncs[BASEDRAWFUNC] = R_DrawSpan_8; - colfuncs[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn_8; - colfuncs[COLDRAWFUNC_TRANS] = R_DrawTranslatedColumn_8; - colfuncs[COLDRAWFUNC_SHADE] = R_DrawShadeColumn_8; - colfuncs[COLDRAWFUNC_SHADOWED] = R_DrawColumnShadowed_8; - colfuncs[COLDRAWFUNC_TRANSTRANS] = R_DrawTranslatedTranslucentColumn_8; - colfuncs[COLDRAWFUNC_TWOSMULTIPATCH] = R_Draw2sMultiPatchColumn_8; - colfuncs[COLDRAWFUNC_TWOSMULTIPATCHTRANS] = R_Draw2sMultiPatchTranslucentColumn_8; - colfuncs[COLDRAWFUNC_FOG] = R_DrawFogColumn_8; - colfuncs[COLDRAWFUNC_DROPSHADOW] = R_DrawDropShadowColumn_8; + colfuncs[BASEDRAWFUNC] = R_DrawColumn; + colfuncs[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn; + colfuncs[COLDRAWFUNC_TRANS] = R_DrawTranslatedColumn; + colfuncs[COLDRAWFUNC_SHADOWED] = R_DrawColumnShadowed; + colfuncs[COLDRAWFUNC_TRANSTRANS] = R_DrawTranslatedTranslucentColumn; + colfuncs[COLDRAWFUNC_TWOSMULTIPATCH] = R_Draw2sMultiPatchColumn; + 
colfuncs[COLDRAWFUNC_TWOSMULTIPATCHTRANS] = R_Draw2sMultiPatchTranslucentColumn; + colfuncs[COLDRAWFUNC_FOG] = R_DrawFogColumn; + colfuncs[COLDRAWFUNC_DROPSHADOW] = R_DrawDropShadowColumn; - spanfuncs[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_8; - spanfuncs[SPANDRAWFUNC_TILTED] = R_DrawTiltedSpan_8; - spanfuncs[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTiltedTranslucentSpan_8; - spanfuncs[SPANDRAWFUNC_SPLAT] = R_DrawSplat_8; - spanfuncs[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_8; - spanfuncs[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawTiltedSplat_8; - spanfuncs[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_8; - spanfuncs[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_8; - spanfuncs[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawTiltedFloorSprite_8; - spanfuncs[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTiltedTranslucentFloorSprite_8; - spanfuncs[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_8; - spanfuncs[SPANDRAWFUNC_TILTEDWATER] = R_DrawTiltedTranslucentWaterSpan_8; - spanfuncs[SPANDRAWFUNC_FOG] = R_DrawFogSpan_8; + colfuncs_bm[BASEDRAWFUNC] = R_DrawColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_TRANS] = R_DrawTranslatedColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_SHADOWED] = R_DrawColumnShadowed_Brightmap; + colfuncs_bm[COLDRAWFUNC_TRANSTRANS] = R_DrawTranslatedTranslucentColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_TWOSMULTIPATCH] = R_Draw2sMultiPatchColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_TWOSMULTIPATCHTRANS] = R_Draw2sMultiPatchTranslucentColumn_Brightmap; + colfuncs_bm[COLDRAWFUNC_FOG] = NULL; // Not needed + colfuncs_bm[COLDRAWFUNC_DROPSHADOW] = NULL; // Not needed - // Lactozilla: Non-powers-of-two - spanfuncs_npo2[BASEDRAWFUNC] = R_DrawSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTED] = R_DrawTiltedSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTiltedTranslucentSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_SPLAT] = 
R_DrawSplat_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawTiltedSplat_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawTiltedFloorSprite_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTiltedTranslucentFloorSprite_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_TILTEDWATER] = R_DrawTiltedTranslucentWaterSpan_NPO2_8; - spanfuncs_npo2[SPANDRAWFUNC_FOG] = NULL; // Not needed + spanfuncs[BASEDRAWFUNC] = R_DrawSpan; + spanfuncs[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan; + spanfuncs[SPANDRAWFUNC_TILTED] = R_DrawSpan_Tilted; + spanfuncs[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTranslucentSpan_Tilted; + spanfuncs[SPANDRAWFUNC_SPLAT] = R_DrawSplat; + spanfuncs[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat; + spanfuncs[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawSplat_Tilted; + spanfuncs[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite; + spanfuncs[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite; + spanfuncs[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawFloorSprite_Tilted; + spanfuncs[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTranslucentFloorSprite_Tilted; + spanfuncs[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan; + spanfuncs[SPANDRAWFUNC_TILTEDWATER] = R_DrawTranslucentWaterSpan_Tilted; + spanfuncs[SPANDRAWFUNC_FOG] = R_DrawFogSpan; + spanfuncs[SPANDRAWFUNC_TILTEDFOG] = R_DrawFogSpan_Tilted; - // Debugging - highlight surfaces in flat colors - spanfuncs_flat[BASEDRAWFUNC] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TRANS] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTED] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_SPLAT] = R_DrawSpan_Flat_8; - 
spanfuncs_flat[SPANDRAWFUNC_TRANSSPLAT] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_SPRITE] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TRANSSPRITE] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_WATER] = R_DrawSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_TILTEDWATER] = R_DrawTiltedSpan_Flat_8; - spanfuncs_flat[SPANDRAWFUNC_FOG] = R_DrawSpan_Flat_8; // Not needed + spanfuncs_bm[BASEDRAWFUNC] = R_DrawSpan_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTED] = R_DrawSpan_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTranslucentSpan_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_SPLAT] = R_DrawSplat_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawSplat_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawFloorSprite_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTranslucentFloorSprite_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_TILTEDWATER] = R_DrawTranslucentWaterSpan_Tilted_Brightmap; + spanfuncs_bm[SPANDRAWFUNC_FOG] = NULL; // Not needed + spanfuncs_bm[SPANDRAWFUNC_TILTEDFOG] = NULL; // Not needed -#if (defined(RUSEASM) && defined(USE_COL_SPAN_ASM)) - if (R_ASM) - { - if (R_MMX) - { - colfuncs_asm[BASEDRAWFUNC] = R_DrawColumn_8_MMX; - //colfuncs_asm[COLDRAWFUNC_SHADE] = R_DrawShadeColumn_8_ASM; - //colfuncs_asm[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn_8_ASM; - colfuncs_asm[COLDRAWFUNC_TWOSMULTIPATCH] 
= R_Draw2sMultiPatchColumn_8_MMX; - spanfuncs_asm[BASEDRAWFUNC] = R_DrawSpan_8_MMX; - } - else - { - colfuncs_asm[BASEDRAWFUNC] = R_DrawColumn_8_ASM; - //colfuncs_asm[COLDRAWFUNC_SHADE] = R_DrawShadeColumn_8_ASM; - //colfuncs_asm[COLDRAWFUNC_FUZZY] = R_DrawTranslucentColumn_8_ASM; - colfuncs_asm[COLDRAWFUNC_TWOSMULTIPATCH] = R_Draw2sMultiPatchColumn_8_ASM; - } - } -#endif + // Lactozilla: Non-powers-of-two + spanfuncs_npo2[BASEDRAWFUNC] = R_DrawSpan_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTED] = R_DrawSpan_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTranslucentSpan_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_SPLAT] = R_DrawSplat_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawSplat_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawFloorSprite_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTranslucentFloorSprite_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_TILTEDWATER] = R_DrawTranslucentWaterSpan_Tilted_NPO2; + spanfuncs_npo2[SPANDRAWFUNC_FOG] = NULL; // Not needed + spanfuncs_npo2[SPANDRAWFUNC_TILTEDFOG] = NULL; // Not needed - R_SetColumnFunc(BASEDRAWFUNC, false); - R_SetSpanFunc(BASEDRAWFUNC, false, false); - } -/* else if (vid.bpp > 1) - { - I_OutputMsg("using highcolor mode\n"); - spanfunc = basespanfunc = R_DrawSpan_16; - transcolfunc = R_DrawTranslatedColumn_16; - transtransfunc = R_DrawTranslucentColumn_16; // No 16bit operation for this function + spanfuncs_bm_npo2[BASEDRAWFUNC] = R_DrawSpan_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TRANS] = R_DrawTranslucentSpan_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTED] = 
R_DrawSpan_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTranslucentSpan_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_SPLAT] = R_DrawSplat_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TRANSSPLAT] = R_DrawTranslucentSplat_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawSplat_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_SPRITE] = R_DrawFloorSprite_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TRANSSPRITE] = R_DrawTranslucentFloorSprite_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawFloorSprite_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTranslucentFloorSprite_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_WATER] = R_DrawTranslucentWaterSpan_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDWATER] = R_DrawTranslucentWaterSpan_Tilted_Brightmap_NPO2; + spanfuncs_bm_npo2[SPANDRAWFUNC_FOG] = NULL; // Not needed + spanfuncs_bm_npo2[SPANDRAWFUNC_TILTEDFOG] = NULL; // Not needed - colfunc = basecolfunc = R_DrawColumn_16; - shadecolfunc = NULL; // detect error if used somewhere.. 
- fuzzcolfunc = R_DrawTranslucentColumn_16; - walldrawerfunc = R_DrawWallColumn_16; - }*/ - else - I_Error("unknown bytes per pixel mode %d\n", vid.bpp); -/* - if (SCR_IsAspectCorrect(vid.width, vid.height)) - CONS_Alert(CONS_WARNING, M_GetText("Resolution is not aspect-correct!\nUse a multiple of %dx%d\n"), BASEVIDWIDTH, BASEVIDHEIGHT); -*/ + // Debugging - highlight surfaces in flat colors + spanfuncs_flat[BASEDRAWFUNC] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TRANS] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTED] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDTRANS] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_SPLAT] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TRANSSPLAT] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDSPLAT] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_SPRITE] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TRANSSPRITE] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDSPRITE] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDTRANSSPRITE] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_WATER] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDWATER] = R_DrawTiltedSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_FOG] = R_DrawSpan_Flat; + spanfuncs_flat[SPANDRAWFUNC_TILTEDFOG] = R_DrawTiltedSpan_Flat; + + R_SetColumnFunc(BASEDRAWFUNC, false); + R_SetSpanFunc(BASEDRAWFUNC, false, false); } void R_SetColumnFunc(size_t id, boolean brightmapped) @@ -202,14 +192,12 @@ void R_SetColumnFunc(size_t id, boolean brightmapped) if (debugrender_highlight != 0) { - colfunc = R_DrawColumn_Flat_8; + colfunc = R_DrawColumn_Flat; } -#ifdef USE_COL_SPAN_ASM - else if (colfuncs_asm[id] != NULL && brightmapped == false) + else if (brightmapped == true && colfuncs_bm[id] != NULL) { - colfunc = colfuncs_asm[id]; + colfunc = colfuncs_bm[id]; } -#endif else { colfunc = colfuncs[id]; @@ -225,19 +213,27 @@ void R_SetSpanFunc(size_t id, boolean npo2, boolean brightmapped) return; } - if 
(spanfuncs_npo2[id] != NULL && npo2 == true) + if (brightmapped == true && spanfuncs_bm[id] != NULL) { - spanfunc = spanfuncs_npo2[id]; + if (npo2 == true && spanfuncs_bm_npo2[id] != NULL) + { + spanfunc = spanfuncs_bm_npo2[id]; + } + else + { + spanfunc = spanfuncs_bm[id]; + } } -#ifdef USE_COL_SPAN_ASM - else if (spanfuncs_asm[id] != NULL && brightmapped == false) - { - spanfunc = spanfuncs_asm[id]; - } -#endif else { - spanfunc = spanfuncs[id]; + if (npo2 == true && spanfuncs_npo2[id] != NULL) + { + spanfunc = spanfuncs_npo2[id]; + } + else + { + spanfunc = spanfuncs[id]; + } } } @@ -267,7 +263,7 @@ boolean R_CheckColumnFunc(size_t id) for (i = 0; i < COLDRAWFUNC_MAX; i++) { - if (colfunc == colfuncs[id] || colfunc == colfuncs_asm[id]) + if (colfunc == colfuncs[id] || colfunc == colfuncs_bm[id]) { return true; }