src/util/bfloat.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

/*
 * Copyright © 2025 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include "u_math.h"

/* Build the BFloat16 bit pattern for a Float NaN.
 *
 * Only the upper 16 bits of the Float survive the conversion.  A NaN whose
 * set significand bits all sit in the discarded lower half would otherwise
 * lose them and stop being a NaN, so one significand bit is always forced
 * on in the result.
 *
 * The caller must pass a NaN; this is asserted.
 */
static inline uint16_t
_mesa_float_nan_to_bfloat_bits(union fi x)
{
   assert(isnan(x.f));

   /* Bit 6 is the topmost of the significand bits kept by BFloat16. */
   const uint32_t forced_bit = 1 << 6;
   const uint32_t upper_half = x.ui >> 16;

   return upper_half | forced_bit;
}

/* Convert a Float to BFloat16 bits, rounding towards zero.
 *
 * The lower 16 bits of the Float are simply dropped.  NaN inputs are
 * delegated to _mesa_float_nan_to_bfloat_bits() so that they remain NaN
 * after the truncation.
 */
static inline uint16_t
_mesa_float_to_bfloat16_bits_rtz(float f)
{
   union fi x;
   x.f = f;

   return isnan(f) ? _mesa_float_nan_to_bfloat_bits(x) : x.ui >> 16;
}

/* Convert a Float to BFloat16 bits, rounding to nearest with ties going to
 * the even result.
 *
 * NaN inputs are delegated to _mesa_float_nan_to_bfloat_bits() so that they
 * remain NaN.
 */
static inline uint16_t
_mesa_float_to_bfloat16_bits_rte(float f)
{
   union fi x;
   x.f = f;

   if (isnan(f))
      return _mesa_float_nan_to_bfloat_bits(x);

   /* The lower 16 bits are discarded; compare them against the halfway
    * point to pick the rounding direction.
    *
    * Rounding up may carry out of the significand.  The significand then
    * wraps to zero and the exponent is incremented; if the exponent
    * reaches 0xff the result is correctly +/- Inf.
    */
   const uint32_t kept = x.ui >> 16;
   const uint32_t dropped = x.ui & 0xffff;
   const uint32_t halfway = 0x8000;

   /* Below the halfway point: round down. */
   if (dropped < halfway)
      return kept;

   /* Exact tie: round up only when that makes the kept value even. */
   if (dropped == halfway)
      return kept + (kept & 1);

   /* Above the halfway point: round up. */
   return kept + 1;
}

/* Convert raw BFloat16 bits to the Float they represent.
 *
 * The BFloat16 bits become the upper 16 bits of the Float and the lower
 * 16 bits are zero, mirroring the `>> 16` used by the float-to-bfloat
 * conversions in this file.
 */
static inline float
_mesa_bfloat16_bits_to_float(uint16_t bf)
{
   union fi x;

   /* Widen before shifting.  A bare `bf << 16` promotes bf to int, and for
    * bf >= 0x8000 (any value with the sign bit set) the shifted result
    * does not fit in a 32-bit int, which is undefined behavior in C.
    */
   x.ui = (uint32_t)bf << 16;

   return x.f;
}