1/////////////////////////////////////////////////////////////////////////// 
2// 
3// Copyright (c) 2002, Industrial Light & Magic, a division of Lucas 
4// Digital Ltd. LLC 
5//  
6// All rights reserved. 
7//  
8// Redistribution and use in source and binary forms, with or without 
9// modification, are permitted provided that the following conditions are 
10// met: 
11// * Redistributions of source code must retain the above copyright 
12// notice, this list of conditions and the following disclaimer. 
13// * Redistributions in binary form must reproduce the above 
14// copyright notice, this list of conditions and the following disclaimer 
15// in the documentation and/or other materials provided with the 
16// distribution. 
17// * Neither the name of Industrial Light & Magic nor the names of 
18// its contributors may be used to endorse or promote products derived 
19// from this software without specific prior written permission.  
20//  
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
22// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
23// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
24// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
25// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
27// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
28// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
29// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
30// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
31// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32// 
33/////////////////////////////////////////////////////////////////////////// 
34 
35// Primary authors: 
36// Florian Kainz <kainz@ilm.com> 
37// Rod Bogart <rgb@ilm.com> 
38 
39//--------------------------------------------------------------------------- 
40// 
41// half -- a 16-bit floating point number class: 
42// 
43// Type half can represent positive and negative numbers whose 
44// magnitude is between roughly 6.1e-5 and 6.5e+4 with a relative 
45// error of 9.8e-4; numbers smaller than 6.1e-5 can be represented 
46// with an absolute error of 6.0e-8. All integers from -2048 to 
47// +2048 can be represented exactly. 
48// 
49// Type half behaves (almost) like the built-in C++ floating point 
50// types. In arithmetic expressions, half, float and double can be 
51// mixed freely. Here are a few examples: 
52// 
53// half a (3.5); 
54// float b (a + sqrt (a)); 
55// a += b; 
56// b += a; 
57// b = a + 7; 
58// 
59// Conversions from half to float are lossless; all half numbers 
60// are exactly representable as floats. 
61// 
62// Conversions from float to half may not preserve a float's value 
63// exactly. If a float is not representable as a half, then the 
64// float value is rounded to the nearest representable half. If a 
65// float value is exactly in the middle between the two closest 
66// representable half values, then the float value is rounded to 
67// the closest half whose least significant bit is zero. 
68// 
69// Overflows during float-to-half conversions cause arithmetic 
70// exceptions. An overflow occurs when the float value to be 
71// converted is too large to be represented as a half, or if the 
72// float value is an infinity or a NAN. 
73// 
74// The implementation of type half makes the following assumptions 
75// about the implementation of the built-in C++ types: 
76// 
77// float is an IEEE 754 single-precision number 
78// sizeof (float) == 4 
79// sizeof (unsigned int) == sizeof (float) 
80// alignof (unsigned int) == alignof (float) 
81// sizeof (unsigned short) == 2 
82// 
83//--------------------------------------------------------------------------- 
84 
85#ifndef _HALF_H_ 
86#define _HALF_H_ 
87 
88#include "halfExport.h" // for definition of HALF_EXPORT 
89#include <iostream> 
90 
91class half 
92
93 public
94 
95 //------------- 
96 // Constructors 
97 //------------- 
98 
99 half () = default; // no initialization 
100 half (float f); 
101 // rule of 5 
102 ~half () = default
103 half (const half &) = default
104 half (half &&) noexcept = default
105 
106 //-------------------- 
107 // Conversion to float 
108 //-------------------- 
109 
110 operator float () const
111 
112 
113 //------------ 
114 // Unary minus 
115 //------------ 
116 
117 half operator - () const
118 
119 
120 //----------- 
121 // Assignment 
122 //----------- 
123 
124 half & operator = (const half &h) = default
125 half & operator = (half &&h) noexcept = default
126 half & operator = (float f); 
127 
128 half & operator += (half h); 
129 half & operator += (float f); 
130 
131 half & operator -= (half h); 
132 half & operator -= (float f); 
133 
134 half & operator *= (half h); 
135 half & operator *= (float f); 
136 
137 half & operator /= (half h); 
138 half & operator /= (float f); 
139 
140 
141 //--------------------------------------------------------- 
142 // Round to n-bit precision (n should be between 0 and 10). 
143 // After rounding, the significand's 10-n least significant 
144 // bits will be zero. 
145 //--------------------------------------------------------- 
146 
147 half round (unsigned int n) const
148 
149 
150 //-------------------------------------------------------------------- 
151 // Classification: 
152 // 
153 // h.isFinite() returns true if h is a normalized number, 
154 // a denormalized number or zero 
155 // 
156 // h.isNormalized() returns true if h is a normalized number 
157 // 
158 // h.isDenormalized() returns true if h is a denormalized number 
159 // 
160 // h.isZero() returns true if h is zero 
161 // 
162 // h.isNan() returns true if h is a NAN 
163 // 
164 // h.isInfinity() returns true if h is a positive 
165 // or a negative infinity 
166 // 
167 // h.isNegative() returns true if the sign bit of h 
168 // is set (negative) 
169 //-------------------------------------------------------------------- 
170 
171 bool isFinite () const
172 bool isNormalized () const
173 bool isDenormalized () const
174 bool isZero () const
175 bool isNan () const
176 bool isInfinity () const
177 bool isNegative () const
178 
179 
180 //-------------------------------------------- 
181 // Special values 
182 // 
183 // posInf() returns +infinity 
184 // 
185 // negInf() returns -infinity 
186 // 
187 // qNan() returns a NAN with the bit 
188 // pattern 0111111111111111 
189 // 
190 // sNan() returns a NAN with the bit 
191 // pattern 0111110111111111 
192 //-------------------------------------------- 
193 
194 static half posInf (); 
195 static half negInf (); 
196 static half qNan (); 
197 static half sNan (); 
198 
199 
200 //-------------------------------------- 
201 // Access to the internal representation 
202 //-------------------------------------- 
203 
204 HALF_EXPORT unsigned short bits () const
205 HALF_EXPORT void setBits (unsigned short bits); 
206 
207 
208 public
209 
210 union uif 
211
212 unsigned int i
213 float f
214 }; 
215 
216 private
217 
218 HALF_EXPORT static short convert (int i); 
219 HALF_EXPORT static float overflow (); 
220 
221 unsigned short _h
222 
223 HALF_EXPORT static const uif _toFloat[1 << 16]; 
224 HALF_EXPORT static const unsigned short _eLut[1 << 9]; 
225}; 
226 
227 
228 
229//----------- 
230// Stream I/O 
231//----------- 
232 
233HALF_EXPORT std::ostream & operator << (std::ostream &os, half h); 
234HALF_EXPORT std::istream & operator >> (std::istream &is, half &h); 
235 
236 
237//---------- 
238// Debugging 
239//---------- 
240 
241HALF_EXPORT void printBits (std::ostream &os, half h); 
242HALF_EXPORT void printBits (std::ostream &os, float f); 
243HALF_EXPORT void printBits (char c[19], half h); 
244HALF_EXPORT void printBits (char c[35], float f); 
245 
246 
247//------------------------------------------------------------------------- 
248// Limits 
249// 
250// Visual C++ will complain if HALF_MIN, HALF_NRM_MIN etc. are not float 
251// constants, but at least one other compiler (gcc 2.96) produces incorrect 
252// results if they are. 
253//------------------------------------------------------------------------- 
254 
#if (defined _WIN32 || defined _WIN64) && defined _MSC_VER

  #define HALF_MIN      5.96046448e-08f // Smallest positive half

  #define HALF_NRM_MIN  6.10351562e-05f // Smallest positive normalized half

  #define HALF_MAX      65504.0f        // Largest positive half

  #define HALF_EPSILON  0.00097656f     // Smallest positive e for which
                                        // half (1.0 + e) != half (1.0)
#else

  #define HALF_MIN      5.96046448e-08  // Smallest positive half

  #define HALF_NRM_MIN  6.10351562e-05  // Smallest positive normalized half

  #define HALF_MAX      65504.0         // Largest positive half

  #define HALF_EPSILON  0.00097656      // Smallest positive e for which
                                        // half (1.0 + e) != half (1.0)
#endif


#define HALF_MANT_DIG   11              // Number of digits in mantissa
                                        // (significand + hidden leading 1)

//
// floor( (HALF_MANT_DIG - 1) * log10(2) ) => 3.01... -> 3
#define HALF_DIG        3               // Number of base 10 digits that
                                        // can be represented without change

// ceil(HALF_MANT_DIG * log10(2) + 1) => 4.31... -> 5
#define HALF_DECIMAL_DIG 5              // Number of base-10 digits that are
                                        // necessary to uniquely represent all
                                        // distinct values

#define HALF_RADIX      2               // Base of the exponent

#define HALF_MIN_EXP    -13             // Minimum negative integer such that
                                        // HALF_RADIX raised to the power of
                                        // one less than that integer is a
                                        // normalized half

#define HALF_MAX_EXP    16              // Maximum positive integer such that
                                        // HALF_RADIX raised to the power of
                                        // one less than that integer is a
                                        // normalized half

#define HALF_MIN_10_EXP -4              // Minimum negative integer such
                                        // that 10 raised to that power is
                                        // a normalized half

#define HALF_MAX_10_EXP 4               // Maximum positive integer such
                                        // that 10 raised to that power is
                                        // a normalized half
310 
311 
312//--------------------------------------------------------------------------- 
313// 
314// Implementation -- 
315// 
316// Representation of a float: 
317// 
318// We assume that a float, f, is an IEEE 754 single-precision 
319// floating point number, whose bits are arranged as follows: 
320// 
//      31 (msb)
//      |
//      | 30     23
//      | |      |
//      | |      | 22                    0 (lsb)
//      | |      | |                     |
//      X XXXXXXXX XXXXXXXXXXXXXXXXXXXXXXX
//
//      s e        m
//
// s is the sign-bit, e is the exponent and m is the significand.
332// 
333// If e is between 1 and 254, f is a normalized number: 
334// 
//              s    e-127
//      f = (-1)  * 2      * 1.m
337// 
338// If e is 0, and m is not zero, f is a denormalized number: 
339// 
//              s    -126
//      f = (-1)  * 2      * 0.m
342// 
343// If e and m are both zero, f is zero: 
344// 
345// f = 0.0 
346// 
347// If e is 255, f is an "infinity" or "not a number" (NAN), 
348// depending on whether m is zero or not. 
349// 
350// Examples: 
351// 
352// 0 00000000 00000000000000000000000 = 0.0 
353// 0 01111110 00000000000000000000000 = 0.5 
354// 0 01111111 00000000000000000000000 = 1.0 
355// 0 10000000 00000000000000000000000 = 2.0 
356// 0 10000000 10000000000000000000000 = 3.0 
357// 1 10000101 11110000010000000000000 = -124.0625 
358// 0 11111111 00000000000000000000000 = +infinity 
359// 1 11111111 00000000000000000000000 = -infinity 
360// 0 11111111 10000000000000000000000 = NAN 
361// 1 11111111 11111111111111111111111 = NAN 
362// 
363// Representation of a half: 
364// 
365// Here is the bit-layout for a half number, h: 
366// 
//      15 (msb)
//      |
//      | 14  10
//      | |   |
//      | |   | 9        0 (lsb)
//      | |   | |        |
//      X XXXXX XXXXXXXXXX
//
//      s e     m
//
// s is the sign-bit, e is the exponent and m is the significand.
378// 
379// If e is between 1 and 30, h is a normalized number: 
380// 
//              s    e-15
//      h = (-1)  * 2     * 1.m
383// 
384// If e is 0, and m is not zero, h is a denormalized number: 
385// 
//              s    -14
//      h = (-1)  * 2     * 0.m
388// 
389// If e and m are both zero, h is zero: 
390// 
391// h = 0.0 
392// 
393// If e is 31, h is an "infinity" or "not a number" (NAN), 
394// depending on whether m is zero or not. 
395// 
396// Examples: 
397// 
398// 0 00000 0000000000 = 0.0 
399// 0 01110 0000000000 = 0.5 
400// 0 01111 0000000000 = 1.0 
401// 0 10000 0000000000 = 2.0 
402// 0 10000 1000000000 = 3.0 
403// 1 10101 1111000001 = -124.0625 
404// 0 11111 0000000000 = +infinity 
405// 1 11111 0000000000 = -infinity 
406// 0 11111 1000000000 = NAN 
407// 1 11111 1111111111 = NAN 
408// 
409// Conversion: 
410// 
411// Converting from a float to a half requires some non-trivial bit 
412// manipulations. In some cases, this makes conversion relatively 
413// slow, but the most common case is accelerated via table lookups. 
414// 
415// Converting back from a half to a float is easier because we don't 
416// have to do any rounding. In addition, there are only 65536 
417// different half numbers; we can convert each of those numbers once 
418// and store the results in a table. Later, all conversions can be 
419// done using only simple table lookups. 
420// 
421//--------------------------------------------------------------------------- 
422 
423 
424//---------------------------- 
425// Half-from-float constructor 
426//---------------------------- 
427 
428inline 
429half::half (float f
430
431 uif x
432 
433 x.f = f
434 
435 if (f == 0
436
437 // 
438 // Common special case - zero. 
439 // Preserve the zero's sign bit. 
440 // 
441 
442 _h = (x.i >> 16); 
443
444 else 
445
446 // 
447 // We extract the combined sign and exponent, e, from our 
448 // floating-point number, f. Then we convert e to the sign 
449 // and exponent of the half number via a table lookup. 
450 // 
451 // For the most common case, where a normalized half is produced, 
452 // the table lookup returns a non-zero value; in this case, all 
453 // we have to do is round f's significand to 10 bits and combine 
454 // the result with e. 
455 // 
456 // For all other cases (overflow, zeroes, denormalized numbers 
457 // resulting from underflow, infinities and NANs), the table 
458 // lookup returns zero, and we call a longer, non-inline function 
459 // to do the float-to-half conversion. 
460 // 
461 
462 int e = (x.i >> 23) & 0x000001ff
463 
464 e = _eLut[e]; 
465 
466 if (e
467
468 // 
469 // Simple case - round the significand, m, to 10 
470 // bits and combine it with the sign and exponent. 
471 // 
472 
473 int m = x.i & 0x007fffff
474 _h = e + ((m + 0x00000fff + ((m >> 13) & 1)) >> 13); 
475
476 else 
477
478 // 
479 // Difficult case - call a function. 
480 // 
481 
482 _h = convert (x.i); 
483
484
485
486 
487 
488//------------------------------------------ 
489// Half-to-float conversion via table lookup 
490//------------------------------------------ 
491 
492inline 
493half::operator float () const 
494
495 return _toFloat[_h].f
496
497 
498 
499//------------------------- 
500// Round to n-bit precision 
501//------------------------- 
502 
503inline half 
504half::round (unsigned int n) const 
505
506 // 
507 // Parameter check. 
508 // 
509 
510 if (n >= 10
511 return *this
512 
513 // 
514 // Disassemble h into the sign, s, 
515 // and the combined exponent and significand, e. 
516 // 
517 
518 unsigned short s = _h & 0x8000
519 unsigned short e = _h & 0x7fff
520 
521 // 
522 // Round the exponent and significand to the nearest value 
523 // where ones occur only in the (10-n) most significant bits. 
524 // Note that the exponent adjusts automatically if rounding 
525 // up causes the significand to overflow. 
526 // 
527 
528 e >>= 9 - n
529 e += e & 1
530 e <<= 9 - n
531 
532 // 
533 // Check for exponent overflow. 
534 // 
535 
536 if (e >= 0x7c00
537
538 // 
539 // Overflow occurred -- truncate instead of rounding. 
540 // 
541 
542 e = _h
543 e >>= 10 - n
544 e <<= 10 - n
545
546 
547 // 
548 // Put the original sign bit back. 
549 // 
550 
551 half h
552 h._h = s | e
553 
554 return h
555
556 
557 
558//----------------------- 
559// Other inline functions 
560//----------------------- 
561 
562inline half  
563half::operator - () const 
564
565 half h
566 h._h = _h ^ 0x8000
567 return h
568
569 
570 
571inline half
572half::operator = (float f
573
574 *this = half (f); 
575 return *this
576
577 
578 
579inline half
580half::operator += (half h
581
582 *this = half (float (*this) + float (h)); 
583 return *this
584
585 
586 
587inline half
588half::operator += (float f
589
590 *this = half (float (*this) + f); 
591 return *this
592
593 
594 
595inline half
596half::operator -= (half h
597
598 *this = half (float (*this) - float (h)); 
599 return *this
600
601 
602 
603inline half
604half::operator -= (float f
605
606 *this = half (float (*this) - f); 
607 return *this
608
609 
610 
611inline half
612half::operator *= (half h
613
614 *this = half (float (*this) * float (h)); 
615 return *this
616
617 
618 
619inline half
620half::operator *= (float f
621
622 *this = half (float (*this) * f); 
623 return *this
624
625 
626 
627inline half
628half::operator /= (half h
629
630 *this = half (float (*this) / float (h)); 
631 return *this
632
633 
634 
635inline half
636half::operator /= (float f
637
638 *this = half (float (*this) / f); 
639 return *this
640
641 
642 
643inline bool  
644half::isFinite () const 
645
646 unsigned short e = (_h >> 10) & 0x001f
647 return e < 31
648
649 
650 
651inline bool 
652half::isNormalized () const 
653
654 unsigned short e = (_h >> 10) & 0x001f
655 return e > 0 && e < 31
656
657 
658 
659inline bool 
660half::isDenormalized () const 
661
662 unsigned short e = (_h >> 10) & 0x001f
663 unsigned short m = _h & 0x3ff
664 return e == 0 && m != 0
665
666 
667 
668inline bool 
669half::isZero () const 
670
671 return (_h & 0x7fff) == 0
672
673 
674 
675inline bool 
676half::isNan () const 
677
678 unsigned short e = (_h >> 10) & 0x001f
679 unsigned short m = _h & 0x3ff
680 return e == 31 && m != 0
681
682 
683 
684inline bool 
685half::isInfinity () const 
686
687 unsigned short e = (_h >> 10) & 0x001f
688 unsigned short m = _h & 0x3ff
689 return e == 31 && m == 0
690
691 
692 
693inline bool  
694half::isNegative () const 
695
696 return (_h & 0x8000) != 0
697
698 
699 
700inline half 
701half::posInf () 
702
703 half h
704 h._h = 0x7c00
705 return h
706
707 
708 
709inline half 
710half::negInf () 
711
712 half h
713 h._h = 0xfc00
714 return h
715
716 
717 
718inline half 
719half::qNan () 
720
721 half h
722 h._h = 0x7fff
723 return h
724
725 
726 
727inline half 
728half::sNan () 
729
730 half h
731 h._h = 0x7dff
732 return h
733
734 
735 
736inline unsigned short 
737half::bits () const 
738
739 return _h
740
741 
742 
743inline void 
744half::setBits (unsigned short bits
745
746 _h = bits
747
748 
749#endif 
750