2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
33 #include "softfloat.h"
36 -------------------------------------------------------------------------------
37 Floating-point rounding mode, extended double-precision rounding precision,
39 -------------------------------------------------------------------------------
41 int8 float_rounding_mode = float_round_nearest_even;
42 int8 floatx80_rounding_precision = 80;
43 int8 float_exception_flags;
46 -------------------------------------------------------------------------------
47 Primitive arithmetic functions, including multi-word arithmetic, and
48 division and square root approximations. (Can be specialized to target if
50 -------------------------------------------------------------------------------
52 #include "softfloat-macros"
55 -------------------------------------------------------------------------------
56 Functions and definitions to determine: (1) whether tininess for underflow
57 is detected before or after rounding by default, (2) what (if anything)
58 happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 are propagated from function inputs to output. These details are target-
62 -------------------------------------------------------------------------------
64 #include "softfloat-specialize"
67 -------------------------------------------------------------------------------
68 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
69 and 7, and returns the properly rounded 32-bit integer corresponding to the
70 input. If `zSign' is nonzero, the input is negated before being converted
71 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
72 input is simply rounded to an integer, with the inexact exception raised if
73 the input cannot be represented exactly as an integer. If the fixed-point
74 input is too large, however, the invalid exception is raised and the largest
75 positive or negative integer is returned.
76 -------------------------------------------------------------------------------
78 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
81 flag roundNearestEven;
82 int8 roundIncrement, roundBits;
85 roundingMode = float_rounding_mode;
86 roundNearestEven = ( roundingMode == float_round_nearest_even );
87 roundIncrement = 0x40;
88 if ( ! roundNearestEven ) {
89 if ( roundingMode == float_round_to_zero ) {
93 roundIncrement = 0x7F;
95 if ( roundingMode == float_round_up ) roundIncrement = 0;
98 if ( roundingMode == float_round_down ) roundIncrement = 0;
102 roundBits = absZ & 0x7F;
103 absZ = ( absZ + roundIncrement )>>7;
104 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
106 if ( zSign ) z = - z;
107 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
108 float_exception_flags |= float_flag_invalid;
109 return zSign ? 0x80000000 : 0x7FFFFFFF;
111 if ( roundBits ) float_exception_flags |= float_flag_inexact;
117 -------------------------------------------------------------------------------
118 Returns the fraction bits of the single-precision floating-point value `a'.
119 -------------------------------------------------------------------------------
121 INLINE bits32 extractFloat32Frac( float32 a )
124 return a & 0x007FFFFF;
129 -------------------------------------------------------------------------------
130 Returns the exponent bits of the single-precision floating-point value `a'.
131 -------------------------------------------------------------------------------
133 INLINE int16 extractFloat32Exp( float32 a )
136 return ( a>>23 ) & 0xFF;
141 -------------------------------------------------------------------------------
142 Returns the sign bit of the single-precision floating-point value `a'.
143 -------------------------------------------------------------------------------
145 INLINE flag extractFloat32Sign( float32 a )
153 -------------------------------------------------------------------------------
154 Normalizes the subnormal single-precision floating-point value represented
155 by the denormalized significand `aSig'. The normalized exponent and
156 significand are stored at the locations pointed to by `zExpPtr' and
157 `zSigPtr', respectively.
158 -------------------------------------------------------------------------------
161 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
165 shiftCount = countLeadingZeros32( aSig ) - 8;
166 *zSigPtr = aSig<<shiftCount;
167 *zExpPtr = 1 - shiftCount;
172 -------------------------------------------------------------------------------
173 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
174 single-precision floating-point value, returning the result. After being
175 shifted into the proper positions, the three fields are simply added
176 together to form the result. This means that any integer portion of `zSig'
177 will be added into the exponent. Since a properly normalized significand
178 will have an integer portion equal to 1, the `zExp' input should be 1 less
179 than the desired result exponent whenever `zSig' is a complete, normalized
181 -------------------------------------------------------------------------------
183 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
185 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
189 -------------------------------------------------------------------------------
190 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
191 and significand `zSig', and returns the proper single-precision floating-
192 point value corresponding to the abstract input. Ordinarily, the abstract
193 value is simply rounded and packed into the single-precision format, with
194 the inexact exception raised if the abstract input cannot be represented
195 exactly. If the abstract value is too large, however, the overflow and
196 inexact exceptions are raised and an infinity or maximal finite value is
197 returned. If the abstract value is too small, the input value is rounded to
198 a subnormal number, and the underflow and inexact exceptions are raised if
199 the abstract input cannot be represented exactly as a subnormal single-
200 precision floating-point number.
201 The input significand `zSig' has its binary point between bits 30
202 and 29, which is 7 bits to the left of the usual location. This shifted
203 significand must be normalized or smaller. If `zSig' is not normalized,
204 `zExp' must be 0; in that case, the result returned is a subnormal number,
205 and it must not require rounding. In the usual case that `zSig' is
206 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
207 The handling of underflow and overflow follows the IEC/IEEE Standard for
208 Binary Floating-point Arithmetic.
209 -------------------------------------------------------------------------------
211 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
214 flag roundNearestEven;
215 int8 roundIncrement, roundBits;
218 roundingMode = float_rounding_mode;
219 roundNearestEven = ( roundingMode == float_round_nearest_even );
220 roundIncrement = 0x40;
221 if ( ! roundNearestEven ) {
222 if ( roundingMode == float_round_to_zero ) {
226 roundIncrement = 0x7F;
228 if ( roundingMode == float_round_up ) roundIncrement = 0;
231 if ( roundingMode == float_round_down ) roundIncrement = 0;
235 roundBits = zSig & 0x7F;
236 if ( 0xFD <= (bits16) zExp ) {
238 || ( ( zExp == 0xFD )
239 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
241 float_raise( float_flag_overflow | float_flag_inexact );
242 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
246 ( float_detect_tininess == float_tininess_before_rounding )
248 || ( zSig + roundIncrement < 0x80000000 );
249 shift32RightJamming( zSig, - zExp, &zSig );
251 roundBits = zSig & 0x7F;
252 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
255 if ( roundBits ) float_exception_flags |= float_flag_inexact;
256 zSig = ( zSig + roundIncrement )>>7;
257 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
258 if ( zSig == 0 ) zExp = 0;
259 return packFloat32( zSign, zExp, zSig );
264 -------------------------------------------------------------------------------
265 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
266 and significand `zSig', and returns the proper single-precision floating-
267 point value corresponding to the abstract input. This routine is just like
268 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
269 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
271 -------------------------------------------------------------------------------
274 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
278 shiftCount = countLeadingZeros32( zSig ) - 1;
279 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
284 -------------------------------------------------------------------------------
285 Returns the fraction bits of the double-precision floating-point value `a'.
286 -------------------------------------------------------------------------------
288 INLINE bits64 extractFloat64Frac( float64 a )
291 return a & LIT64( 0x000FFFFFFFFFFFFF );
296 -------------------------------------------------------------------------------
297 Returns the exponent bits of the double-precision floating-point value `a'.
298 -------------------------------------------------------------------------------
300 INLINE int16 extractFloat64Exp( float64 a )
303 return ( a>>52 ) & 0x7FF;
308 -------------------------------------------------------------------------------
309 Returns the sign bit of the double-precision floating-point value `a'.
310 -------------------------------------------------------------------------------
312 INLINE flag extractFloat64Sign( float64 a )
320 -------------------------------------------------------------------------------
321 Normalizes the subnormal double-precision floating-point value represented
322 by the denormalized significand `aSig'. The normalized exponent and
323 significand are stored at the locations pointed to by `zExpPtr' and
324 `zSigPtr', respectively.
325 -------------------------------------------------------------------------------
328 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
332 shiftCount = countLeadingZeros64( aSig ) - 11;
333 *zSigPtr = aSig<<shiftCount;
334 *zExpPtr = 1 - shiftCount;
339 -------------------------------------------------------------------------------
340 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
341 double-precision floating-point value, returning the result. After being
342 shifted into the proper positions, the three fields are simply added
343 together to form the result. This means that any integer portion of `zSig'
344 will be added into the exponent. Since a properly normalized significand
345 will have an integer portion equal to 1, the `zExp' input should be 1 less
346 than the desired result exponent whenever `zSig' is a complete, normalized
348 -------------------------------------------------------------------------------
350 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
353 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
358 -------------------------------------------------------------------------------
359 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360 and significand `zSig', and returns the proper double-precision floating-
361 point value corresponding to the abstract input. Ordinarily, the abstract
362 value is simply rounded and packed into the double-precision format, with
363 the inexact exception raised if the abstract input cannot be represented
364 exactly. If the abstract value is too large, however, the overflow and
365 inexact exceptions are raised and an infinity or maximal finite value is
366 returned. If the abstract value is too small, the input value is rounded to
367 a subnormal number, and the underflow and inexact exceptions are raised if
368 the abstract input cannot be represented exactly as a subnormal double-
369 precision floating-point number.
370 The input significand `zSig' has its binary point between bits 62
371 and 61, which is 10 bits to the left of the usual location. This shifted
372 significand must be normalized or smaller. If `zSig' is not normalized,
373 `zExp' must be 0; in that case, the result returned is a subnormal number,
374 and it must not require rounding. In the usual case that `zSig' is
375 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
376 The handling of underflow and overflow follows the IEC/IEEE Standard for
377 Binary Floating-point Arithmetic.
378 -------------------------------------------------------------------------------
380 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
383 flag roundNearestEven;
384 int16 roundIncrement, roundBits;
387 roundingMode = float_rounding_mode;
388 roundNearestEven = ( roundingMode == float_round_nearest_even );
389 roundIncrement = 0x200;
390 if ( ! roundNearestEven ) {
391 if ( roundingMode == float_round_to_zero ) {
395 roundIncrement = 0x3FF;
397 if ( roundingMode == float_round_up ) roundIncrement = 0;
400 if ( roundingMode == float_round_down ) roundIncrement = 0;
404 roundBits = zSig & 0x3FF;
405 if ( 0x7FD <= (bits16) zExp ) {
406 if ( ( 0x7FD < zExp )
407 || ( ( zExp == 0x7FD )
408 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
410 //register int lr = __builtin_return_address(0);
411 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
412 float_raise( float_flag_overflow | float_flag_inexact );
413 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
417 ( float_detect_tininess == float_tininess_before_rounding )
419 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
420 shift64RightJamming( zSig, - zExp, &zSig );
422 roundBits = zSig & 0x3FF;
423 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
426 if ( roundBits ) float_exception_flags |= float_flag_inexact;
427 zSig = ( zSig + roundIncrement )>>10;
428 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
429 if ( zSig == 0 ) zExp = 0;
430 return packFloat64( zSign, zExp, zSig );
435 -------------------------------------------------------------------------------
436 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
437 and significand `zSig', and returns the proper double-precision floating-
438 point value corresponding to the abstract input. This routine is just like
439 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
440 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
442 -------------------------------------------------------------------------------
445 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
449 shiftCount = countLeadingZeros64( zSig ) - 1;
450 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
457 -------------------------------------------------------------------------------
458 Returns the fraction bits of the extended double-precision floating-point
460 -------------------------------------------------------------------------------
462 INLINE bits64 extractFloatx80Frac( floatx80 a )
470 -------------------------------------------------------------------------------
471 Returns the exponent bits of the extended double-precision floating-point
473 -------------------------------------------------------------------------------
475 INLINE int32 extractFloatx80Exp( floatx80 a )
478 return a.high & 0x7FFF;
483 -------------------------------------------------------------------------------
484 Returns the sign bit of the extended double-precision floating-point value
486 -------------------------------------------------------------------------------
488 INLINE flag extractFloatx80Sign( floatx80 a )
496 -------------------------------------------------------------------------------
497 Normalizes the subnormal extended double-precision floating-point value
498 represented by the denormalized significand `aSig'. The normalized exponent
499 and significand are stored at the locations pointed to by `zExpPtr' and
500 `zSigPtr', respectively.
501 -------------------------------------------------------------------------------
504 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
508 shiftCount = countLeadingZeros64( aSig );
509 *zSigPtr = aSig<<shiftCount;
510 *zExpPtr = 1 - shiftCount;
515 -------------------------------------------------------------------------------
516 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
517 extended double-precision floating-point value, returning the result.
518 -------------------------------------------------------------------------------
520 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
525 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
531 -------------------------------------------------------------------------------
532 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
533 and extended significand formed by the concatenation of `zSig0' and `zSig1',
534 and returns the proper extended double-precision floating-point value
535 corresponding to the abstract input. Ordinarily, the abstract value is
536 rounded and packed into the extended double-precision format, with the
537 inexact exception raised if the abstract input cannot be represented
538 exactly. If the abstract value is too large, however, the overflow and
539 inexact exceptions are raised and an infinity or maximal finite value is
540 returned. If the abstract value is too small, the input value is rounded to
541 a subnormal number, and the underflow and inexact exceptions are raised if
542 the abstract input cannot be represented exactly as a subnormal extended
543 double-precision floating-point number.
544 If `roundingPrecision' is 32 or 64, the result is rounded to the same
545 number of bits as single or double precision, respectively. Otherwise, the
546 result is rounded to the full precision of the extended double-precision
548 The input significand must be normalized or smaller. If the input
549 significand is not normalized, `zExp' must be 0; in that case, the result
550 returned is a subnormal number, and it must not require rounding. The
551 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
552 Floating-point Arithmetic.
553 -------------------------------------------------------------------------------
556 roundAndPackFloatx80(
557 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
561 flag roundNearestEven, increment, isTiny;
562 int64 roundIncrement, roundMask, roundBits;
564 roundingMode = float_rounding_mode;
565 roundNearestEven = ( roundingMode == float_round_nearest_even );
566 if ( roundingPrecision == 80 ) goto precision80;
567 if ( roundingPrecision == 64 ) {
568 roundIncrement = LIT64( 0x0000000000000400 );
569 roundMask = LIT64( 0x00000000000007FF );
571 else if ( roundingPrecision == 32 ) {
572 roundIncrement = LIT64( 0x0000008000000000 );
573 roundMask = LIT64( 0x000000FFFFFFFFFF );
578 zSig0 |= ( zSig1 != 0 );
579 if ( ! roundNearestEven ) {
580 if ( roundingMode == float_round_to_zero ) {
584 roundIncrement = roundMask;
586 if ( roundingMode == float_round_up ) roundIncrement = 0;
589 if ( roundingMode == float_round_down ) roundIncrement = 0;
593 roundBits = zSig0 & roundMask;
594 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
595 if ( ( 0x7FFE < zExp )
596 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
602 ( float_detect_tininess == float_tininess_before_rounding )
604 || ( zSig0 <= zSig0 + roundIncrement );
605 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
607 roundBits = zSig0 & roundMask;
608 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
609 if ( roundBits ) float_exception_flags |= float_flag_inexact;
610 zSig0 += roundIncrement;
611 if ( (sbits64) zSig0 < 0 ) zExp = 1;
612 roundIncrement = roundMask + 1;
613 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
614 roundMask |= roundIncrement;
616 zSig0 &= ~ roundMask;
617 return packFloatx80( zSign, zExp, zSig0 );
620 if ( roundBits ) float_exception_flags |= float_flag_inexact;
621 zSig0 += roundIncrement;
622 if ( zSig0 < roundIncrement ) {
624 zSig0 = LIT64( 0x8000000000000000 );
626 roundIncrement = roundMask + 1;
627 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
628 roundMask |= roundIncrement;
630 zSig0 &= ~ roundMask;
631 if ( zSig0 == 0 ) zExp = 0;
632 return packFloatx80( zSign, zExp, zSig0 );
634 increment = ( (sbits64) zSig1 < 0 );
635 if ( ! roundNearestEven ) {
636 if ( roundingMode == float_round_to_zero ) {
641 increment = ( roundingMode == float_round_down ) && zSig1;
644 increment = ( roundingMode == float_round_up ) && zSig1;
648 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
649 if ( ( 0x7FFE < zExp )
650 || ( ( zExp == 0x7FFE )
651 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
657 float_raise( float_flag_overflow | float_flag_inexact );
658 if ( ( roundingMode == float_round_to_zero )
659 || ( zSign && ( roundingMode == float_round_up ) )
660 || ( ! zSign && ( roundingMode == float_round_down ) )
662 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
664 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
668 ( float_detect_tininess == float_tininess_before_rounding )
671 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
672 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
674 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
675 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
676 if ( roundNearestEven ) {
677 increment = ( (sbits64) zSig1 < 0 );
681 increment = ( roundingMode == float_round_down ) && zSig1;
684 increment = ( roundingMode == float_round_up ) && zSig1;
689 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
690 if ( (sbits64) zSig0 < 0 ) zExp = 1;
692 return packFloatx80( zSign, zExp, zSig0 );
695 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
700 zSig0 = LIT64( 0x8000000000000000 );
703 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
707 if ( zSig0 == 0 ) zExp = 0;
710 return packFloatx80( zSign, zExp, zSig0 );
714 -------------------------------------------------------------------------------
715 Takes an abstract floating-point value having sign `zSign', exponent
716 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
717 and returns the proper extended double-precision floating-point value
718 corresponding to the abstract input. This routine is just like
719 `roundAndPackFloatx80' except that the input significand does not have to be
721 -------------------------------------------------------------------------------
724 normalizeRoundAndPackFloatx80(
725 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
735 shiftCount = countLeadingZeros64( zSig0 );
736 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
739 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
746 -------------------------------------------------------------------------------
747 Returns the result of converting the 32-bit two's complement integer `a' to
748 the single-precision floating-point format. The conversion is performed
749 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
750 -------------------------------------------------------------------------------
752 float32 int32_to_float32( int32 a )
756 if ( a == 0 ) return 0;
757 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
759 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
764 -------------------------------------------------------------------------------
765 Returns the result of converting the 32-bit two's complement integer `a' to
766 the double-precision floating-point format. The conversion is performed
767 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
768 -------------------------------------------------------------------------------
770 float64 int32_to_float64( int32 a )
777 if ( a == 0 ) return 0;
779 absA = aSign ? - a : a;
780 shiftCount = countLeadingZeros32( absA ) + 21;
782 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
789 -------------------------------------------------------------------------------
790 Returns the result of converting the 32-bit two's complement integer `a'
791 to the extended double-precision floating-point format. The conversion
792 is performed according to the IEC/IEEE Standard for Binary Floating-point
794 -------------------------------------------------------------------------------
796 floatx80 int32_to_floatx80( int32 a )
803 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
805 absA = zSign ? - a : a;
806 shiftCount = countLeadingZeros32( absA ) + 32;
808 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
815 -------------------------------------------------------------------------------
816 Returns the result of converting the single-precision floating-point value
817 `a' to the 32-bit two's complement integer format. The conversion is
818 performed according to the IEC/IEEE Standard for Binary Floating-point
819 Arithmetic---which means in particular that the conversion is rounded
820 according to the current rounding mode. If `a' is a NaN, the largest
821 positive integer is returned. Otherwise, if the conversion overflows, the
822 largest integer with the same sign as `a' is returned.
823 -------------------------------------------------------------------------------
825 int32 float32_to_int32( float32 a )
828 int16 aExp, shiftCount;
832 aSig = extractFloat32Frac( a );
833 aExp = extractFloat32Exp( a );
834 aSign = extractFloat32Sign( a );
835 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
836 if ( aExp ) aSig |= 0x00800000;
837 shiftCount = 0xAF - aExp;
840 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
841 return roundAndPackInt32( aSign, zSig );
846 -------------------------------------------------------------------------------
847 Returns the result of converting the single-precision floating-point value
848 `a' to the 32-bit two's complement integer format. The conversion is
849 performed according to the IEC/IEEE Standard for Binary Floating-point
850 Arithmetic, except that the conversion is always rounded toward zero. If
851 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
852 conversion overflows, the largest integer with the same sign as `a' is
854 -------------------------------------------------------------------------------
856 int32 float32_to_int32_round_to_zero( float32 a )
859 int16 aExp, shiftCount;
863 aSig = extractFloat32Frac( a );
864 aExp = extractFloat32Exp( a );
865 aSign = extractFloat32Sign( a );
866 shiftCount = aExp - 0x9E;
867 if ( 0 <= shiftCount ) {
868 if ( a == 0xCF000000 ) return 0x80000000;
869 float_raise( float_flag_invalid );
870 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
873 else if ( aExp <= 0x7E ) {
874 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
877 aSig = ( aSig | 0x00800000 )<<8;
878 z = aSig>>( - shiftCount );
879 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
880 float_exception_flags |= float_flag_inexact;
882 return aSign ? - z : z;
887 -------------------------------------------------------------------------------
888 Returns the result of converting the single-precision floating-point value
889 `a' to the double-precision floating-point format. The conversion is
890 performed according to the IEC/IEEE Standard for Binary Floating-point
892 -------------------------------------------------------------------------------
894 float64 float32_to_float64( float32 a )
900 aSig = extractFloat32Frac( a );
901 aExp = extractFloat32Exp( a );
902 aSign = extractFloat32Sign( a );
903 if ( aExp == 0xFF ) {
904 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
905 return packFloat64( aSign, 0x7FF, 0 );
908 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
909 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
912 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
919 -------------------------------------------------------------------------------
920 Returns the result of converting the single-precision floating-point value
921 `a' to the extended double-precision floating-point format. The conversion
922 is performed according to the IEC/IEEE Standard for Binary Floating-point
924 -------------------------------------------------------------------------------
926 floatx80 float32_to_floatx80( float32 a )
932 aSig = extractFloat32Frac( a );
933 aExp = extractFloat32Exp( a );
934 aSign = extractFloat32Sign( a );
935 if ( aExp == 0xFF ) {
936 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
937 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
940 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
941 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
944 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
951 -------------------------------------------------------------------------------
952 Rounds the single-precision floating-point value `a' to an integer, and
953 returns the result as a single-precision floating-point value. The
954 operation is performed according to the IEC/IEEE Standard for Binary
955 Floating-point Arithmetic.
956 -------------------------------------------------------------------------------
958 float32 float32_round_to_int( float32 a )
962 bits32 lastBitMask, roundBitsMask;
966 aExp = extractFloat32Exp( a );
967 if ( 0x96 <= aExp ) {
968 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
969 return propagateFloat32NaN( a, a );
973 if ( aExp <= 0x7E ) {
974 if ( (bits32) ( a<<1 ) == 0 ) return a;
975 float_exception_flags |= float_flag_inexact;
976 aSign = extractFloat32Sign( a );
977 switch ( float_rounding_mode ) {
978 case float_round_nearest_even:
979 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
980 return packFloat32( aSign, 0x7F, 0 );
983 case float_round_down:
984 return aSign ? 0xBF800000 : 0;
986 return aSign ? 0x80000000 : 0x3F800000;
988 return packFloat32( aSign, 0, 0 );
991 lastBitMask <<= 0x96 - aExp;
992 roundBitsMask = lastBitMask - 1;
994 roundingMode = float_rounding_mode;
995 if ( roundingMode == float_round_nearest_even ) {
997 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
999 else if ( roundingMode != float_round_to_zero ) {
1000 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1004 z &= ~ roundBitsMask;
1005 if ( z != a ) float_exception_flags |= float_flag_inexact;
1011 -------------------------------------------------------------------------------
1012 Returns the result of adding the absolute values of the single-precision
1013 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1014 before being returned. `zSign' is ignored if the result is a NaN. The
1015 addition is performed according to the IEC/IEEE Standard for Binary
1016 Floating-point Arithmetic.
1017 -------------------------------------------------------------------------------
1019 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1021 int16 aExp, bExp, zExp;
1022 bits32 aSig, bSig, zSig;
1025 aSig = extractFloat32Frac( a );
1026 aExp = extractFloat32Exp( a );
1027 bSig = extractFloat32Frac( b );
1028 bExp = extractFloat32Exp( b );
1029 expDiff = aExp - bExp;
1032 if ( 0 < expDiff ) {
1033 if ( aExp == 0xFF ) {
1034 if ( aSig ) return propagateFloat32NaN( a, b );
1043 shift32RightJamming( bSig, expDiff, &bSig );
1046 else if ( expDiff < 0 ) {
1047 if ( bExp == 0xFF ) {
1048 if ( bSig ) return propagateFloat32NaN( a, b );
1049 return packFloat32( zSign, 0xFF, 0 );
1057 shift32RightJamming( aSig, - expDiff, &aSig );
1061 if ( aExp == 0xFF ) {
1062 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1065 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1066 zSig = 0x40000000 + aSig + bSig;
1071 zSig = ( aSig + bSig )<<1;
1073 if ( (sbits32) zSig < 0 ) {
1078 return roundAndPackFloat32( zSign, zExp, zSig );
1083 -------------------------------------------------------------------------------
1084 Returns the result of subtracting the absolute values of the single-
1085 precision floating-point values `a' and `b'. If `zSign' is true, the
1086 difference is negated before being returned. `zSign' is ignored if the
1087 result is a NaN. The subtraction is performed according to the IEC/IEEE
1088 Standard for Binary Floating-point Arithmetic.
1089 -------------------------------------------------------------------------------
1091 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1093 int16 aExp, bExp, zExp;
1094 bits32 aSig, bSig, zSig;
1097 aSig = extractFloat32Frac( a );
1098 aExp = extractFloat32Exp( a );
1099 bSig = extractFloat32Frac( b );
1100 bExp = extractFloat32Exp( b );
1101 expDiff = aExp - bExp;
1104 if ( 0 < expDiff ) goto aExpBigger;
1105 if ( expDiff < 0 ) goto bExpBigger;
1106 if ( aExp == 0xFF ) {
1107 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1108 float_raise( float_flag_invalid );
1109 return float32_default_nan;
1115 if ( bSig < aSig ) goto aBigger;
1116 if ( aSig < bSig ) goto bBigger;
1117 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1119 if ( bExp == 0xFF ) {
1120 if ( bSig ) return propagateFloat32NaN( a, b );
1121 return packFloat32( zSign ^ 1, 0xFF, 0 );
1129 shift32RightJamming( aSig, - expDiff, &aSig );
1135 goto normalizeRoundAndPack;
1137 if ( aExp == 0xFF ) {
1138 if ( aSig ) return propagateFloat32NaN( a, b );
1147 shift32RightJamming( bSig, expDiff, &bSig );
1152 normalizeRoundAndPack:
1154 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1159 -------------------------------------------------------------------------------
1160 Returns the result of adding the single-precision floating-point values `a'
1161 and `b'. The operation is performed according to the IEC/IEEE Standard for
1162 Binary Floating-point Arithmetic.
1163 -------------------------------------------------------------------------------
1165 float32 float32_add( float32 a, float32 b )
1169 aSign = extractFloat32Sign( a );
1170 bSign = extractFloat32Sign( b );
1171 if ( aSign == bSign ) {
1172 return addFloat32Sigs( a, b, aSign );
1175 return subFloat32Sigs( a, b, aSign );
1181 -------------------------------------------------------------------------------
1182 Returns the result of subtracting the single-precision floating-point values
1183 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1184 for Binary Floating-point Arithmetic.
1185 -------------------------------------------------------------------------------
1187 float32 float32_sub( float32 a, float32 b )
1191 aSign = extractFloat32Sign( a );
1192 bSign = extractFloat32Sign( b );
1193 if ( aSign == bSign ) {
1194 return subFloat32Sigs( a, b, aSign );
1197 return addFloat32Sigs( a, b, aSign );
1203 -------------------------------------------------------------------------------
1204 Returns the result of multiplying the single-precision floating-point values
1205 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1206 for Binary Floating-point Arithmetic.
1207 -------------------------------------------------------------------------------
1209 float32 float32_mul( float32 a, float32 b )
1211 flag aSign, bSign, zSign;
1212 int16 aExp, bExp, zExp;
1217 aSig = extractFloat32Frac( a );
1218 aExp = extractFloat32Exp( a );
1219 aSign = extractFloat32Sign( a );
1220 bSig = extractFloat32Frac( b );
1221 bExp = extractFloat32Exp( b );
1222 bSign = extractFloat32Sign( b );
1223 zSign = aSign ^ bSign;
1224 if ( aExp == 0xFF ) {
1225 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1226 return propagateFloat32NaN( a, b );
1228 if ( ( bExp | bSig ) == 0 ) {
1229 float_raise( float_flag_invalid );
1230 return float32_default_nan;
1232 return packFloat32( zSign, 0xFF, 0 );
1234 if ( bExp == 0xFF ) {
1235 if ( bSig ) return propagateFloat32NaN( a, b );
1236 if ( ( aExp | aSig ) == 0 ) {
1237 float_raise( float_flag_invalid );
1238 return float32_default_nan;
1240 return packFloat32( zSign, 0xFF, 0 );
1243 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1244 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1247 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1248 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1250 zExp = aExp + bExp - 0x7F;
1251 aSig = ( aSig | 0x00800000 )<<7;
1252 bSig = ( bSig | 0x00800000 )<<8;
1253 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1255 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1259 return roundAndPackFloat32( zSign, zExp, zSig );
1264 -------------------------------------------------------------------------------
1265 Returns the result of dividing the single-precision floating-point value `a'
1266 by the corresponding value `b'. The operation is performed according to the
1267 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1268 -------------------------------------------------------------------------------
1270 float32 float32_div( float32 a, float32 b )
1272 flag aSign, bSign, zSign;
1273 int16 aExp, bExp, zExp;
1274 bits32 aSig, bSig, zSig;
1276 aSig = extractFloat32Frac( a );
1277 aExp = extractFloat32Exp( a );
1278 aSign = extractFloat32Sign( a );
1279 bSig = extractFloat32Frac( b );
1280 bExp = extractFloat32Exp( b );
1281 bSign = extractFloat32Sign( b );
1282 zSign = aSign ^ bSign;
1283 if ( aExp == 0xFF ) {
1284 if ( aSig ) return propagateFloat32NaN( a, b );
1285 if ( bExp == 0xFF ) {
1286 if ( bSig ) return propagateFloat32NaN( a, b );
1287 float_raise( float_flag_invalid );
1288 return float32_default_nan;
1290 return packFloat32( zSign, 0xFF, 0 );
1292 if ( bExp == 0xFF ) {
1293 if ( bSig ) return propagateFloat32NaN( a, b );
1294 return packFloat32( zSign, 0, 0 );
1298 if ( ( aExp | aSig ) == 0 ) {
1299 float_raise( float_flag_invalid );
1300 return float32_default_nan;
1302 float_raise( float_flag_divbyzero );
1303 return packFloat32( zSign, 0xFF, 0 );
1305 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1308 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1309 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1311 zExp = aExp - bExp + 0x7D;
1312 aSig = ( aSig | 0x00800000 )<<7;
1313 bSig = ( bSig | 0x00800000 )<<8;
1314 if ( bSig <= ( aSig + aSig ) ) {
1318 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1319 if ( ( zSig & 0x3F ) == 0 ) {
1320 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1322 return roundAndPackFloat32( zSign, zExp, zSig );
1327 -------------------------------------------------------------------------------
1328 Returns the remainder of the single-precision floating-point value `a'
1329 with respect to the corresponding value `b'. The operation is performed
1330 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1331 -------------------------------------------------------------------------------
1333 float32 float32_rem( float32 a, float32 b )
1335 flag aSign, bSign, zSign;
1336 int16 aExp, bExp, expDiff;
1339 bits64 aSig64, bSig64, q64;
1340 bits32 alternateASig;
1343 aSig = extractFloat32Frac( a );
1344 aExp = extractFloat32Exp( a );
1345 aSign = extractFloat32Sign( a );
1346 bSig = extractFloat32Frac( b );
1347 bExp = extractFloat32Exp( b );
1348 bSign = extractFloat32Sign( b );
1349 if ( aExp == 0xFF ) {
1350 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1351 return propagateFloat32NaN( a, b );
1353 float_raise( float_flag_invalid );
1354 return float32_default_nan;
1356 if ( bExp == 0xFF ) {
1357 if ( bSig ) return propagateFloat32NaN( a, b );
1362 float_raise( float_flag_invalid );
1363 return float32_default_nan;
1365 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1368 if ( aSig == 0 ) return a;
1369 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1371 expDiff = aExp - bExp;
1374 if ( expDiff < 32 ) {
1377 if ( expDiff < 0 ) {
1378 if ( expDiff < -1 ) return a;
1381 q = ( bSig <= aSig );
1382 if ( q ) aSig -= bSig;
1383 if ( 0 < expDiff ) {
1384 q = ( ( (bits64) aSig )<<32 ) / bSig;
1387 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1395 if ( bSig <= aSig ) aSig -= bSig;
1396 aSig64 = ( (bits64) aSig )<<40;
1397 bSig64 = ( (bits64) bSig )<<40;
1399 while ( 0 < expDiff ) {
1400 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1401 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1402 aSig64 = - ( ( bSig * q64 )<<38 );
1406 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1407 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1408 q = q64>>( 64 - expDiff );
1410 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1413 alternateASig = aSig;
1416 } while ( 0 <= (sbits32) aSig );
1417 sigMean = aSig + alternateASig;
1418 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1419 aSig = alternateASig;
1421 zSign = ( (sbits32) aSig < 0 );
1422 if ( zSign ) aSig = - aSig;
1423 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1428 -------------------------------------------------------------------------------
1429 Returns the square root of the single-precision floating-point value `a'.
1430 The operation is performed according to the IEC/IEEE Standard for Binary
1431 Floating-point Arithmetic.
1432 -------------------------------------------------------------------------------
1434 float32 float32_sqrt( float32 a )
1441 aSig = extractFloat32Frac( a );
1442 aExp = extractFloat32Exp( a );
1443 aSign = extractFloat32Sign( a );
1444 if ( aExp == 0xFF ) {
1445 if ( aSig ) return propagateFloat32NaN( a, 0 );
1446 if ( ! aSign ) return a;
1447 float_raise( float_flag_invalid );
1448 return float32_default_nan;
1451 if ( ( aExp | aSig ) == 0 ) return a;
1452 float_raise( float_flag_invalid );
1453 return float32_default_nan;
1456 if ( aSig == 0 ) return 0;
1457 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1459 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1460 aSig = ( aSig | 0x00800000 )<<8;
1461 zSig = estimateSqrt32( aExp, aSig ) + 2;
1462 if ( ( zSig & 0x7F ) <= 5 ) {
1468 term = ( (bits64) zSig ) * zSig;
1469 rem = ( ( (bits64) aSig )<<32 ) - term;
1470 while ( (sbits64) rem < 0 ) {
1472 rem += ( ( (bits64) zSig )<<1 ) | 1;
1474 zSig |= ( rem != 0 );
1477 shift32RightJamming( zSig, 1, &zSig );
1478 return roundAndPackFloat32( 0, zExp, zSig );
1483 -------------------------------------------------------------------------------
1484 Returns 1 if the single-precision floating-point value `a' is equal to the
1485 corresponding value `b', and 0 otherwise. The comparison is performed
1486 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1487 -------------------------------------------------------------------------------
1489 flag float32_eq( float32 a, float32 b )
1492 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1493 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1495 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1496 float_raise( float_flag_invalid );
1500 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1505 -------------------------------------------------------------------------------
1506 Returns 1 if the single-precision floating-point value `a' is less than or
1507 equal to the corresponding value `b', and 0 otherwise. The comparison is
1508 performed according to the IEC/IEEE Standard for Binary Floating-point
1510 -------------------------------------------------------------------------------
1512 flag float32_le( float32 a, float32 b )
1516 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1517 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1519 float_raise( float_flag_invalid );
1522 aSign = extractFloat32Sign( a );
1523 bSign = extractFloat32Sign( b );
1524 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1525 return ( a == b ) || ( aSign ^ ( a < b ) );
1530 -------------------------------------------------------------------------------
1531 Returns 1 if the single-precision floating-point value `a' is less than
1532 the corresponding value `b', and 0 otherwise. The comparison is performed
1533 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1534 -------------------------------------------------------------------------------
1536 flag float32_lt( float32 a, float32 b )
1540 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1541 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1543 float_raise( float_flag_invalid );
1546 aSign = extractFloat32Sign( a );
1547 bSign = extractFloat32Sign( b );
1548 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1549 return ( a != b ) && ( aSign ^ ( a < b ) );
1554 -------------------------------------------------------------------------------
1555 Returns 1 if the single-precision floating-point value `a' is equal to the
1556 corresponding value `b', and 0 otherwise. The invalid exception is raised
1557 if either operand is a NaN. Otherwise, the comparison is performed
1558 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1559 -------------------------------------------------------------------------------
1561 flag float32_eq_signaling( float32 a, float32 b )
1564 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1565 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1567 float_raise( float_flag_invalid );
1570 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1575 -------------------------------------------------------------------------------
1576 Returns 1 if the single-precision floating-point value `a' is less than or
1577 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1578 cause an exception. Otherwise, the comparison is performed according to the
1579 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1580 -------------------------------------------------------------------------------
1582 flag float32_le_quiet( float32 a, float32 b )
1587 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1588 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1590 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1591 float_raise( float_flag_invalid );
1595 aSign = extractFloat32Sign( a );
1596 bSign = extractFloat32Sign( b );
1597 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1598 return ( a == b ) || ( aSign ^ ( a < b ) );
1603 -------------------------------------------------------------------------------
1604 Returns 1 if the single-precision floating-point value `a' is less than
1605 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1606 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1607 Standard for Binary Floating-point Arithmetic.
1608 -------------------------------------------------------------------------------
1610 flag float32_lt_quiet( float32 a, float32 b )
1614 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1615 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1617 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1618 float_raise( float_flag_invalid );
1622 aSign = extractFloat32Sign( a );
1623 bSign = extractFloat32Sign( b );
1624 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1625 return ( a != b ) && ( aSign ^ ( a < b ) );
1630 -------------------------------------------------------------------------------
1631 Returns the result of converting the double-precision floating-point value
1632 `a' to the 32-bit two's complement integer format. The conversion is
1633 performed according to the IEC/IEEE Standard for Binary Floating-point
1634 Arithmetic---which means in particular that the conversion is rounded
1635 according to the current rounding mode. If `a' is a NaN, the largest
1636 positive integer is returned. Otherwise, if the conversion overflows, the
1637 largest integer with the same sign as `a' is returned.
1638 -------------------------------------------------------------------------------
1640 int32 float64_to_int32( float64 a )
1643 int16 aExp, shiftCount;
1646 aSig = extractFloat64Frac( a );
1647 aExp = extractFloat64Exp( a );
1648 aSign = extractFloat64Sign( a );
1649 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1650 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1651 shiftCount = 0x42C - aExp;
1652 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1653 return roundAndPackInt32( aSign, aSig );
1658 -------------------------------------------------------------------------------
1659 Returns the result of converting the double-precision floating-point value
1660 `a' to the 32-bit two's complement integer format. The conversion is
1661 performed according to the IEC/IEEE Standard for Binary Floating-point
1662 Arithmetic, except that the conversion is always rounded toward zero. If
1663 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1664 conversion overflows, the largest integer with the same sign as `a' is
1666 -------------------------------------------------------------------------------
1668 int32 float64_to_int32_round_to_zero( float64 a )
1671 int16 aExp, shiftCount;
1672 bits64 aSig, savedASig;
1675 aSig = extractFloat64Frac( a );
1676 aExp = extractFloat64Exp( a );
1677 aSign = extractFloat64Sign( a );
1678 shiftCount = 0x433 - aExp;
1679 if ( shiftCount < 21 ) {
1680 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1683 else if ( 52 < shiftCount ) {
1684 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1687 aSig |= LIT64( 0x0010000000000000 );
1689 aSig >>= shiftCount;
1691 if ( aSign ) z = - z;
1692 if ( ( z < 0 ) ^ aSign ) {
1694 float_exception_flags |= float_flag_invalid;
1695 return aSign ? 0x80000000 : 0x7FFFFFFF;
1697 if ( ( aSig<<shiftCount ) != savedASig ) {
1698 float_exception_flags |= float_flag_inexact;
1705 -------------------------------------------------------------------------------
1706 Returns the result of converting the double-precision floating-point value
1707 `a' to the 32-bit two's complement unsigned integer format. The conversion
1708 is performed according to the IEC/IEEE Standard for Binary Floating-point
1709 Arithmetic---which means in particular that the conversion is rounded
1710 according to the current rounding mode. If `a' is a NaN, the largest
1711 positive integer is returned. Otherwise, if the conversion overflows, the
1712 largest positive integer is returned.
1713 -------------------------------------------------------------------------------
1715 int32 float64_to_uint32( float64 a )
1718 int16 aExp, shiftCount;
1721 aSig = extractFloat64Frac( a );
1722 aExp = extractFloat64Exp( a );
1723 aSign = 0; //extractFloat64Sign( a );
1724 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1725 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1726 shiftCount = 0x42C - aExp;
1727 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1728 return roundAndPackInt32( aSign, aSig );
1732 -------------------------------------------------------------------------------
1733 Returns the result of converting the double-precision floating-point value
1734 `a' to the 32-bit two's complement integer format. The conversion is
1735 performed according to the IEC/IEEE Standard for Binary Floating-point
1736 Arithmetic, except that the conversion is always rounded toward zero. If
1737 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1738 conversion overflows, the largest positive integer is returned.
1739 -------------------------------------------------------------------------------
1741 int32 float64_to_uint32_round_to_zero( float64 a )
1744 int16 aExp, shiftCount;
1745 bits64 aSig, savedASig;
1748 aSig = extractFloat64Frac( a );
1749 aExp = extractFloat64Exp( a );
1750 aSign = extractFloat64Sign( a );
1751 shiftCount = 0x433 - aExp;
1752 if ( shiftCount < 21 ) {
1753 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1756 else if ( 52 < shiftCount ) {
1757 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1760 aSig |= LIT64( 0x0010000000000000 );
1762 aSig >>= shiftCount;
1764 if ( aSign ) z = - z;
1765 if ( ( z < 0 ) ^ aSign ) {
1767 float_exception_flags |= float_flag_invalid;
1768 return aSign ? 0x80000000 : 0x7FFFFFFF;
1770 if ( ( aSig<<shiftCount ) != savedASig ) {
1771 float_exception_flags |= float_flag_inexact;
1777 -------------------------------------------------------------------------------
1778 Returns the result of converting the double-precision floating-point value
1779 `a' to the single-precision floating-point format. The conversion is
1780 performed according to the IEC/IEEE Standard for Binary Floating-point
1782 -------------------------------------------------------------------------------
1784 float32 float64_to_float32( float64 a )
1791 aSig = extractFloat64Frac( a );
1792 aExp = extractFloat64Exp( a );
1793 aSign = extractFloat64Sign( a );
1794 if ( aExp == 0x7FF ) {
1795 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1796 return packFloat32( aSign, 0xFF, 0 );
1798 shift64RightJamming( aSig, 22, &aSig );
1800 if ( aExp || zSig ) {
1804 return roundAndPackFloat32( aSign, aExp, zSig );
1811 -------------------------------------------------------------------------------
1812 Returns the result of converting the double-precision floating-point value
1813 `a' to the extended double-precision floating-point format. The conversion
1814 is performed according to the IEC/IEEE Standard for Binary Floating-point
1816 -------------------------------------------------------------------------------
1818 floatx80 float64_to_floatx80( float64 a )
1824 aSig = extractFloat64Frac( a );
1825 aExp = extractFloat64Exp( a );
1826 aSign = extractFloat64Sign( a );
1827 if ( aExp == 0x7FF ) {
1828 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1829 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1832 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1833 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1837 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1844 -------------------------------------------------------------------------------
1845 Rounds the double-precision floating-point value `a' to an integer, and
1846 returns the result as a double-precision floating-point value. The
1847 operation is performed according to the IEC/IEEE Standard for Binary
1848 Floating-point Arithmetic.
1849 -------------------------------------------------------------------------------
1851 float64 float64_round_to_int( float64 a )
1855 bits64 lastBitMask, roundBitsMask;
1859 aExp = extractFloat64Exp( a );
1860 if ( 0x433 <= aExp ) {
1861 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1862 return propagateFloat64NaN( a, a );
1866 if ( aExp <= 0x3FE ) {
1867 if ( (bits64) ( a<<1 ) == 0 ) return a;
1868 float_exception_flags |= float_flag_inexact;
1869 aSign = extractFloat64Sign( a );
1870 switch ( float_rounding_mode ) {
1871 case float_round_nearest_even:
1872 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1873 return packFloat64( aSign, 0x3FF, 0 );
1876 case float_round_down:
1877 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1878 case float_round_up:
1880 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1882 return packFloat64( aSign, 0, 0 );
1885 lastBitMask <<= 0x433 - aExp;
1886 roundBitsMask = lastBitMask - 1;
1888 roundingMode = float_rounding_mode;
1889 if ( roundingMode == float_round_nearest_even ) {
1890 z += lastBitMask>>1;
1891 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1893 else if ( roundingMode != float_round_to_zero ) {
1894 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1898 z &= ~ roundBitsMask;
1899 if ( z != a ) float_exception_flags |= float_flag_inexact;
1905 -------------------------------------------------------------------------------
1906 Returns the result of adding the absolute values of the double-precision
1907 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1908 before being returned. `zSign' is ignored if the result is a NaN. The
1909 addition is performed according to the IEC/IEEE Standard for Binary
1910 Floating-point Arithmetic.
1911 -------------------------------------------------------------------------------
1913 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1915 int16 aExp, bExp, zExp;
1916 bits64 aSig, bSig, zSig;
1919 aSig = extractFloat64Frac( a );
1920 aExp = extractFloat64Exp( a );
1921 bSig = extractFloat64Frac( b );
1922 bExp = extractFloat64Exp( b );
1923 expDiff = aExp - bExp;
1926 if ( 0 < expDiff ) {
1927 if ( aExp == 0x7FF ) {
1928 if ( aSig ) return propagateFloat64NaN( a, b );
1935 bSig |= LIT64( 0x2000000000000000 );
1937 shift64RightJamming( bSig, expDiff, &bSig );
1940 else if ( expDiff < 0 ) {
1941 if ( bExp == 0x7FF ) {
1942 if ( bSig ) return propagateFloat64NaN( a, b );
1943 return packFloat64( zSign, 0x7FF, 0 );
1949 aSig |= LIT64( 0x2000000000000000 );
1951 shift64RightJamming( aSig, - expDiff, &aSig );
1955 if ( aExp == 0x7FF ) {
1956 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1959 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1960 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1964 aSig |= LIT64( 0x2000000000000000 );
1965 zSig = ( aSig + bSig )<<1;
1967 if ( (sbits64) zSig < 0 ) {
1972 return roundAndPackFloat64( zSign, zExp, zSig );
1977 -------------------------------------------------------------------------------
1978 Returns the result of subtracting the absolute values of the double-
1979 precision floating-point values `a' and `b'. If `zSign' is true, the
1980 difference is negated before being returned. `zSign' is ignored if the
1981 result is a NaN. The subtraction is performed according to the IEC/IEEE
1982 Standard for Binary Floating-point Arithmetic.
1983 -------------------------------------------------------------------------------
1985 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1987 int16 aExp, bExp, zExp;
1988 bits64 aSig, bSig, zSig;
1991 aSig = extractFloat64Frac( a );
1992 aExp = extractFloat64Exp( a );
1993 bSig = extractFloat64Frac( b );
1994 bExp = extractFloat64Exp( b );
1995 expDiff = aExp - bExp;
1998 if ( 0 < expDiff ) goto aExpBigger;
1999 if ( expDiff < 0 ) goto bExpBigger;
2000 if ( aExp == 0x7FF ) {
2001 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2002 float_raise( float_flag_invalid );
2003 return float64_default_nan;
2009 if ( bSig < aSig ) goto aBigger;
2010 if ( aSig < bSig ) goto bBigger;
2011 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2013 if ( bExp == 0x7FF ) {
2014 if ( bSig ) return propagateFloat64NaN( a, b );
2015 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2021 aSig |= LIT64( 0x4000000000000000 );
2023 shift64RightJamming( aSig, - expDiff, &aSig );
2024 bSig |= LIT64( 0x4000000000000000 );
2029 goto normalizeRoundAndPack;
2031 if ( aExp == 0x7FF ) {
2032 if ( aSig ) return propagateFloat64NaN( a, b );
2039 bSig |= LIT64( 0x4000000000000000 );
2041 shift64RightJamming( bSig, expDiff, &bSig );
2042 aSig |= LIT64( 0x4000000000000000 );
2046 normalizeRoundAndPack:
2048 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2053 -------------------------------------------------------------------------------
2054 Returns the result of adding the double-precision floating-point values `a'
2055 and `b'. The operation is performed according to the IEC/IEEE Standard for
2056 Binary Floating-point Arithmetic.
2057 -------------------------------------------------------------------------------
2059 float64 float64_add( float64 a, float64 b )
2063 aSign = extractFloat64Sign( a );
2064 bSign = extractFloat64Sign( b );
2065 if ( aSign == bSign ) {
2066 return addFloat64Sigs( a, b, aSign );
2069 return subFloat64Sigs( a, b, aSign );
2075 -------------------------------------------------------------------------------
2076 Returns the result of subtracting the double-precision floating-point values
2077 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2078 for Binary Floating-point Arithmetic.
2079 -------------------------------------------------------------------------------
2081 float64 float64_sub( float64 a, float64 b )
2085 aSign = extractFloat64Sign( a );
2086 bSign = extractFloat64Sign( b );
2087 if ( aSign == bSign ) {
2088 return subFloat64Sigs( a, b, aSign );
2091 return addFloat64Sigs( a, b, aSign );
2097 -------------------------------------------------------------------------------
2098 Returns the result of multiplying the double-precision floating-point values
2099 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2100 for Binary Floating-point Arithmetic.
2101 -------------------------------------------------------------------------------
2103 float64 float64_mul( float64 a, float64 b )
2105 flag aSign, bSign, zSign;
2106 int16 aExp, bExp, zExp;
2107 bits64 aSig, bSig, zSig0, zSig1;
2109 aSig = extractFloat64Frac( a );
2110 aExp = extractFloat64Exp( a );
2111 aSign = extractFloat64Sign( a );
2112 bSig = extractFloat64Frac( b );
2113 bExp = extractFloat64Exp( b );
2114 bSign = extractFloat64Sign( b );
2115 zSign = aSign ^ bSign;
2116 if ( aExp == 0x7FF ) {
2117 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2118 return propagateFloat64NaN( a, b );
2120 if ( ( bExp | bSig ) == 0 ) {
2121 float_raise( float_flag_invalid );
2122 return float64_default_nan;
2124 return packFloat64( zSign, 0x7FF, 0 );
2126 if ( bExp == 0x7FF ) {
2127 if ( bSig ) return propagateFloat64NaN( a, b );
2128 if ( ( aExp | aSig ) == 0 ) {
2129 float_raise( float_flag_invalid );
2130 return float64_default_nan;
2132 return packFloat64( zSign, 0x7FF, 0 );
2135 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2136 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2139 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2140 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2142 zExp = aExp + bExp - 0x3FF;
2143 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2144 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2145 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2146 zSig0 |= ( zSig1 != 0 );
2147 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2151 return roundAndPackFloat64( zSign, zExp, zSig0 );
2156 -------------------------------------------------------------------------------
2157 Returns the result of dividing the double-precision floating-point value `a'
2158 by the corresponding value `b'. The operation is performed according to
2159 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2160 -------------------------------------------------------------------------------
2162 float64 float64_div( float64 a, float64 b )
2164 flag aSign, bSign, zSign;
2165 int16 aExp, bExp, zExp;
2166 bits64 aSig, bSig, zSig;
2168 bits64 term0, term1;
2170 aSig = extractFloat64Frac( a );
2171 aExp = extractFloat64Exp( a );
2172 aSign = extractFloat64Sign( a );
2173 bSig = extractFloat64Frac( b );
2174 bExp = extractFloat64Exp( b );
2175 bSign = extractFloat64Sign( b );
2176 zSign = aSign ^ bSign;
2177 if ( aExp == 0x7FF ) {
2178 if ( aSig ) return propagateFloat64NaN( a, b );
2179 if ( bExp == 0x7FF ) {
2180 if ( bSig ) return propagateFloat64NaN( a, b );
2181 float_raise( float_flag_invalid );
2182 return float64_default_nan;
2184 return packFloat64( zSign, 0x7FF, 0 );
2186 if ( bExp == 0x7FF ) {
2187 if ( bSig ) return propagateFloat64NaN( a, b );
2188 return packFloat64( zSign, 0, 0 );
2192 if ( ( aExp | aSig ) == 0 ) {
2193 float_raise( float_flag_invalid );
2194 return float64_default_nan;
2196 float_raise( float_flag_divbyzero );
2197 return packFloat64( zSign, 0x7FF, 0 );
2199 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2202 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2203 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2205 zExp = aExp - bExp + 0x3FD;
2206 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2207 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2208 if ( bSig <= ( aSig + aSig ) ) {
2212 zSig = estimateDiv128To64( aSig, 0, bSig );
2213 if ( ( zSig & 0x1FF ) <= 2 ) {
2214 mul64To128( bSig, zSig, &term0, &term1 );
2215 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2216 while ( (sbits64) rem0 < 0 ) {
2218 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2220 zSig |= ( rem1 != 0 );
2222 return roundAndPackFloat64( zSign, zExp, zSig );
2227 -------------------------------------------------------------------------------
2228 Returns the remainder of the double-precision floating-point value `a'
2229 with respect to the corresponding value `b'. The operation is performed
2230 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2231 -------------------------------------------------------------------------------
2233 float64 float64_rem( float64 a, float64 b )
2235 flag aSign, bSign, zSign;
2236 int16 aExp, bExp, expDiff;
2238 bits64 q, alternateASig;
2241 aSig = extractFloat64Frac( a );
2242 aExp = extractFloat64Exp( a );
2243 aSign = extractFloat64Sign( a );
2244 bSig = extractFloat64Frac( b );
2245 bExp = extractFloat64Exp( b );
2246 bSign = extractFloat64Sign( b );
2247 if ( aExp == 0x7FF ) {
2248 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2249 return propagateFloat64NaN( a, b );
2251 float_raise( float_flag_invalid );
2252 return float64_default_nan;
2254 if ( bExp == 0x7FF ) {
2255 if ( bSig ) return propagateFloat64NaN( a, b );
2260 float_raise( float_flag_invalid );
2261 return float64_default_nan;
2263 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2266 if ( aSig == 0 ) return a;
2267 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2269 expDiff = aExp - bExp;
2270 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2271 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2272 if ( expDiff < 0 ) {
2273 if ( expDiff < -1 ) return a;
2276 q = ( bSig <= aSig );
2277 if ( q ) aSig -= bSig;
2279 while ( 0 < expDiff ) {
2280 q = estimateDiv128To64( aSig, 0, bSig );
2281 q = ( 2 < q ) ? q - 2 : 0;
2282 aSig = - ( ( bSig>>2 ) * q );
2286 if ( 0 < expDiff ) {
2287 q = estimateDiv128To64( aSig, 0, bSig );
2288 q = ( 2 < q ) ? q - 2 : 0;
2291 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2298 alternateASig = aSig;
2301 } while ( 0 <= (sbits64) aSig );
2302 sigMean = aSig + alternateASig;
2303 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2304 aSig = alternateASig;
2306 zSign = ( (sbits64) aSig < 0 );
2307 if ( zSign ) aSig = - aSig;
2308 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2313 -------------------------------------------------------------------------------
2314 Returns the square root of the double-precision floating-point value `a'.
2315 The operation is performed according to the IEC/IEEE Standard for Binary
2316 Floating-point Arithmetic.
2317 -------------------------------------------------------------------------------
2319 float64 float64_sqrt( float64 a )
2324 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2327 aSig = extractFloat64Frac( a );
2328 aExp = extractFloat64Exp( a );
2329 aSign = extractFloat64Sign( a );
2330 if ( aExp == 0x7FF ) {
2331 if ( aSig ) return propagateFloat64NaN( a, a );
2332 if ( ! aSign ) return a;
2333 float_raise( float_flag_invalid );
2334 return float64_default_nan;
2337 if ( ( aExp | aSig ) == 0 ) return a;
2338 float_raise( float_flag_invalid );
2339 return float64_default_nan;
2342 if ( aSig == 0 ) return 0;
2343 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2345 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2346 aSig |= LIT64( 0x0010000000000000 );
2347 zSig = estimateSqrt32( aExp, aSig>>21 );
2349 aSig <<= 9 - ( aExp & 1 );
2350 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2351 if ( ( zSig & 0x3FF ) <= 5 ) {
2353 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2357 mul64To128( zSig, zSig, &term0, &term1 );
2358 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2359 while ( (sbits64) rem0 < 0 ) {
2361 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2363 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2365 zSig |= ( ( rem0 | rem1 ) != 0 );
2368 shift64RightJamming( zSig, 1, &zSig );
2369 return roundAndPackFloat64( 0, zExp, zSig );
2374 -------------------------------------------------------------------------------
2375 Returns 1 if the double-precision floating-point value `a' is equal to the
2376 corresponding value `b', and 0 otherwise. The comparison is performed
2377 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2378 -------------------------------------------------------------------------------
2380 flag float64_eq( float64 a, float64 b )
2383 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2384 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2386 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2387 float_raise( float_flag_invalid );
2391 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2396 -------------------------------------------------------------------------------
2397 Returns 1 if the double-precision floating-point value `a' is less than or
2398 equal to the corresponding value `b', and 0 otherwise. The comparison is
2399 performed according to the IEC/IEEE Standard for Binary Floating-point
2401 -------------------------------------------------------------------------------
2403 flag float64_le( float64 a, float64 b )
2407 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2408 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2410 float_raise( float_flag_invalid );
2413 aSign = extractFloat64Sign( a );
2414 bSign = extractFloat64Sign( b );
2415 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2416 return ( a == b ) || ( aSign ^ ( a < b ) );
2421 -------------------------------------------------------------------------------
2422 Returns 1 if the double-precision floating-point value `a' is less than
2423 the corresponding value `b', and 0 otherwise. The comparison is performed
2424 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2425 -------------------------------------------------------------------------------
2427 flag float64_lt( float64 a, float64 b )
2431 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2432 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2434 float_raise( float_flag_invalid );
2437 aSign = extractFloat64Sign( a );
2438 bSign = extractFloat64Sign( b );
2439 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2440 return ( a != b ) && ( aSign ^ ( a < b ) );
2445 -------------------------------------------------------------------------------
2446 Returns 1 if the double-precision floating-point value `a' is equal to the
2447 corresponding value `b', and 0 otherwise. The invalid exception is raised
2448 if either operand is a NaN. Otherwise, the comparison is performed
2449 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2450 -------------------------------------------------------------------------------
2452 flag float64_eq_signaling( float64 a, float64 b )
2455 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2456 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2458 float_raise( float_flag_invalid );
2461 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2466 -------------------------------------------------------------------------------
2467 Returns 1 if the double-precision floating-point value `a' is less than or
2468 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2469 cause an exception. Otherwise, the comparison is performed according to the
2470 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2471 -------------------------------------------------------------------------------
2473 flag float64_le_quiet( float64 a, float64 b )
2478 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2479 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2481 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2482 float_raise( float_flag_invalid );
2486 aSign = extractFloat64Sign( a );
2487 bSign = extractFloat64Sign( b );
2488 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2489 return ( a == b ) || ( aSign ^ ( a < b ) );
2494 -------------------------------------------------------------------------------
2495 Returns 1 if the double-precision floating-point value `a' is less than
2496 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2497 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2498 Standard for Binary Floating-point Arithmetic.
2499 -------------------------------------------------------------------------------
2501 flag float64_lt_quiet( float64 a, float64 b )
2505 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2506 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2508 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2509 float_raise( float_flag_invalid );
2513 aSign = extractFloat64Sign( a );
2514 bSign = extractFloat64Sign( b );
2515 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2516 return ( a != b ) && ( aSign ^ ( a < b ) );
2523 -------------------------------------------------------------------------------
2524 Returns the result of converting the extended double-precision floating-
2525 point value `a' to the 32-bit two's complement integer format. The
2526 conversion is performed according to the IEC/IEEE Standard for Binary
2527 Floating-point Arithmetic---which means in particular that the conversion
2528 is rounded according to the current rounding mode. If `a' is a NaN, the
2529 largest positive integer is returned. Otherwise, if the conversion
2530 overflows, the largest integer with the same sign as `a' is returned.
2531 -------------------------------------------------------------------------------
2533 int32 floatx80_to_int32( floatx80 a )
2536 int32 aExp, shiftCount;
2539 aSig = extractFloatx80Frac( a );
2540 aExp = extractFloatx80Exp( a );
2541 aSign = extractFloatx80Sign( a );
2542 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2543 shiftCount = 0x4037 - aExp;
2544 if ( shiftCount <= 0 ) shiftCount = 1;
2545 shift64RightJamming( aSig, shiftCount, &aSig );
2546 return roundAndPackInt32( aSign, aSig );
2551 -------------------------------------------------------------------------------
2552 Returns the result of converting the extended double-precision floating-
2553 point value `a' to the 32-bit two's complement integer format. The
2554 conversion is performed according to the IEC/IEEE Standard for Binary
2555 Floating-point Arithmetic, except that the conversion is always rounded
2556 toward zero. If `a' is a NaN, the largest positive integer is returned.
2557 Otherwise, if the conversion overflows, the largest integer with the same
2558 sign as `a' is returned.
2559 -------------------------------------------------------------------------------
2561 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2564 int32 aExp, shiftCount;
2565 bits64 aSig, savedASig;
2568 aSig = extractFloatx80Frac( a );
2569 aExp = extractFloatx80Exp( a );
2570 aSign = extractFloatx80Sign( a );
2571 shiftCount = 0x403E - aExp;
2572 if ( shiftCount < 32 ) {
2573 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2576 else if ( 63 < shiftCount ) {
2577 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2581 aSig >>= shiftCount;
2583 if ( aSign ) z = - z;
2584 if ( ( z < 0 ) ^ aSign ) {
2586 float_exception_flags |= float_flag_invalid;
2587 return aSign ? 0x80000000 : 0x7FFFFFFF;
2589 if ( ( aSig<<shiftCount ) != savedASig ) {
2590 float_exception_flags |= float_flag_inexact;
2597 -------------------------------------------------------------------------------
2598 Returns the result of converting the extended double-precision floating-
2599 point value `a' to the single-precision floating-point format. The
2600 conversion is performed according to the IEC/IEEE Standard for Binary
2601 Floating-point Arithmetic.
2602 -------------------------------------------------------------------------------
2604 float32 floatx80_to_float32( floatx80 a )
2610 aSig = extractFloatx80Frac( a );
2611 aExp = extractFloatx80Exp( a );
2612 aSign = extractFloatx80Sign( a );
2613 if ( aExp == 0x7FFF ) {
2614 if ( (bits64) ( aSig<<1 ) ) {
2615 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2617 return packFloat32( aSign, 0xFF, 0 );
2619 shift64RightJamming( aSig, 33, &aSig );
2620 if ( aExp || aSig ) aExp -= 0x3F81;
2621 return roundAndPackFloat32( aSign, aExp, aSig );
2626 -------------------------------------------------------------------------------
2627 Returns the result of converting the extended double-precision floating-
2628 point value `a' to the double-precision floating-point format. The
2629 conversion is performed according to the IEC/IEEE Standard for Binary
2630 Floating-point Arithmetic.
2631 -------------------------------------------------------------------------------
2633 float64 floatx80_to_float64( floatx80 a )
2639 aSig = extractFloatx80Frac( a );
2640 aExp = extractFloatx80Exp( a );
2641 aSign = extractFloatx80Sign( a );
2642 if ( aExp == 0x7FFF ) {
2643 if ( (bits64) ( aSig<<1 ) ) {
2644 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2646 return packFloat64( aSign, 0x7FF, 0 );
2648 shift64RightJamming( aSig, 1, &zSig );
2649 if ( aExp || aSig ) aExp -= 0x3C01;
2650 return roundAndPackFloat64( aSign, aExp, zSig );
2655 -------------------------------------------------------------------------------
2656 Rounds the extended double-precision floating-point value `a' to an integer,
2657 and returns the result as an extended quadruple-precision floating-point
2658 value. The operation is performed according to the IEC/IEEE Standard for
2659 Binary Floating-point Arithmetic.
2660 -------------------------------------------------------------------------------
2662 floatx80 floatx80_round_to_int( floatx80 a )
2666 bits64 lastBitMask, roundBitsMask;
2670 aExp = extractFloatx80Exp( a );
2671 if ( 0x403E <= aExp ) {
2672 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2673 return propagateFloatx80NaN( a, a );
2677 if ( aExp <= 0x3FFE ) {
2679 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2682 float_exception_flags |= float_flag_inexact;
2683 aSign = extractFloatx80Sign( a );
2684 switch ( float_rounding_mode ) {
2685 case float_round_nearest_even:
2686 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2689 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2692 case float_round_down:
2695 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2696 : packFloatx80( 0, 0, 0 );
2697 case float_round_up:
2699 aSign ? packFloatx80( 1, 0, 0 )
2700 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2702 return packFloatx80( aSign, 0, 0 );
2705 lastBitMask <<= 0x403E - aExp;
2706 roundBitsMask = lastBitMask - 1;
2708 roundingMode = float_rounding_mode;
2709 if ( roundingMode == float_round_nearest_even ) {
2710 z.low += lastBitMask>>1;
2711 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2713 else if ( roundingMode != float_round_to_zero ) {
2714 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2715 z.low += roundBitsMask;
2718 z.low &= ~ roundBitsMask;
2721 z.low = LIT64( 0x8000000000000000 );
2723 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2729 -------------------------------------------------------------------------------
2730 Returns the result of adding the absolute values of the extended double-
2731 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2732 negated before being returned. `zSign' is ignored if the result is a NaN.
2733 The addition is performed according to the IEC/IEEE Standard for Binary
2734 Floating-point Arithmetic.
2735 -------------------------------------------------------------------------------
2737 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2739 int32 aExp, bExp, zExp;
2740 bits64 aSig, bSig, zSig0, zSig1;
2743 aSig = extractFloatx80Frac( a );
2744 aExp = extractFloatx80Exp( a );
2745 bSig = extractFloatx80Frac( b );
2746 bExp = extractFloatx80Exp( b );
2747 expDiff = aExp - bExp;
2748 if ( 0 < expDiff ) {
2749 if ( aExp == 0x7FFF ) {
2750 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2753 if ( bExp == 0 ) --expDiff;
2754 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2757 else if ( expDiff < 0 ) {
2758 if ( bExp == 0x7FFF ) {
2759 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2760 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2762 if ( aExp == 0 ) ++expDiff;
2763 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2767 if ( aExp == 0x7FFF ) {
2768 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2769 return propagateFloatx80NaN( a, b );
2774 zSig0 = aSig + bSig;
2776 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2783 zSig0 = aSig + bSig;
2785 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2787 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2788 zSig0 |= LIT64( 0x8000000000000000 );
2792 roundAndPackFloatx80(
2793 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2798 -------------------------------------------------------------------------------
2799 Returns the result of subtracting the absolute values of the extended
2800 double-precision floating-point values `a' and `b'. If `zSign' is true,
2801 the difference is negated before being returned. `zSign' is ignored if the
2802 result is a NaN. The subtraction is performed according to the IEC/IEEE
2803 Standard for Binary Floating-point Arithmetic.
2804 -------------------------------------------------------------------------------
2806 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2808 int32 aExp, bExp, zExp;
2809 bits64 aSig, bSig, zSig0, zSig1;
2813 aSig = extractFloatx80Frac( a );
2814 aExp = extractFloatx80Exp( a );
2815 bSig = extractFloatx80Frac( b );
2816 bExp = extractFloatx80Exp( b );
2817 expDiff = aExp - bExp;
2818 if ( 0 < expDiff ) goto aExpBigger;
2819 if ( expDiff < 0 ) goto bExpBigger;
2820 if ( aExp == 0x7FFF ) {
2821 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2822 return propagateFloatx80NaN( a, b );
2824 float_raise( float_flag_invalid );
2825 z.low = floatx80_default_nan_low;
2826 z.high = floatx80_default_nan_high;
2834 if ( bSig < aSig ) goto aBigger;
2835 if ( aSig < bSig ) goto bBigger;
2836 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2838 if ( bExp == 0x7FFF ) {
2839 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2840 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2842 if ( aExp == 0 ) ++expDiff;
2843 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2845 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2848 goto normalizeRoundAndPack;
2850 if ( aExp == 0x7FFF ) {
2851 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2854 if ( bExp == 0 ) --expDiff;
2855 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2857 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2859 normalizeRoundAndPack:
2861 normalizeRoundAndPackFloatx80(
2862 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2867 -------------------------------------------------------------------------------
2868 Returns the result of adding the extended double-precision floating-point
2869 values `a' and `b'. The operation is performed according to the IEC/IEEE
2870 Standard for Binary Floating-point Arithmetic.
2871 -------------------------------------------------------------------------------
2873 floatx80 floatx80_add( floatx80 a, floatx80 b )
2877 aSign = extractFloatx80Sign( a );
2878 bSign = extractFloatx80Sign( b );
2879 if ( aSign == bSign ) {
2880 return addFloatx80Sigs( a, b, aSign );
2883 return subFloatx80Sigs( a, b, aSign );
2889 -------------------------------------------------------------------------------
2890 Returns the result of subtracting the extended double-precision floating-
2891 point values `a' and `b'. The operation is performed according to the
2892 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2893 -------------------------------------------------------------------------------
2895 floatx80 floatx80_sub( floatx80 a, floatx80 b )
2899 aSign = extractFloatx80Sign( a );
2900 bSign = extractFloatx80Sign( b );
2901 if ( aSign == bSign ) {
2902 return subFloatx80Sigs( a, b, aSign );
2905 return addFloatx80Sigs( a, b, aSign );
2911 -------------------------------------------------------------------------------
2912 Returns the result of multiplying the extended double-precision floating-
2913 point values `a' and `b'. The operation is performed according to the
2914 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2915 -------------------------------------------------------------------------------
2917 floatx80 floatx80_mul( floatx80 a, floatx80 b )
2919 flag aSign, bSign, zSign;
2920 int32 aExp, bExp, zExp;
2921 bits64 aSig, bSig, zSig0, zSig1;
2924 aSig = extractFloatx80Frac( a );
2925 aExp = extractFloatx80Exp( a );
2926 aSign = extractFloatx80Sign( a );
2927 bSig = extractFloatx80Frac( b );
2928 bExp = extractFloatx80Exp( b );
2929 bSign = extractFloatx80Sign( b );
2930 zSign = aSign ^ bSign;
2931 if ( aExp == 0x7FFF ) {
2932 if ( (bits64) ( aSig<<1 )
2933 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2934 return propagateFloatx80NaN( a, b );
2936 if ( ( bExp | bSig ) == 0 ) goto invalid;
2937 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2939 if ( bExp == 0x7FFF ) {
2940 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2941 if ( ( aExp | aSig ) == 0 ) {
2943 float_raise( float_flag_invalid );
2944 z.low = floatx80_default_nan_low;
2945 z.high = floatx80_default_nan_high;
2948 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2951 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2952 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2955 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2956 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2958 zExp = aExp + bExp - 0x3FFE;
2959 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2960 if ( 0 < (sbits64) zSig0 ) {
2961 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2965 roundAndPackFloatx80(
2966 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2971 -------------------------------------------------------------------------------
2972 Returns the result of dividing the extended double-precision floating-point
2973 value `a' by the corresponding value `b'. The operation is performed
2974 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2975 -------------------------------------------------------------------------------
2977 floatx80 floatx80_div( floatx80 a, floatx80 b )
2979 flag aSign, bSign, zSign;
2980 int32 aExp, bExp, zExp;
2981 bits64 aSig, bSig, zSig0, zSig1;
2982 bits64 rem0, rem1, rem2, term0, term1, term2;
2985 aSig = extractFloatx80Frac( a );
2986 aExp = extractFloatx80Exp( a );
2987 aSign = extractFloatx80Sign( a );
2988 bSig = extractFloatx80Frac( b );
2989 bExp = extractFloatx80Exp( b );
2990 bSign = extractFloatx80Sign( b );
2991 zSign = aSign ^ bSign;
2992 if ( aExp == 0x7FFF ) {
2993 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2994 if ( bExp == 0x7FFF ) {
2995 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2998 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3000 if ( bExp == 0x7FFF ) {
3001 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3002 return packFloatx80( zSign, 0, 0 );
3006 if ( ( aExp | aSig ) == 0 ) {
3008 float_raise( float_flag_invalid );
3009 z.low = floatx80_default_nan_low;
3010 z.high = floatx80_default_nan_high;
3013 float_raise( float_flag_divbyzero );
3014 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3016 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3019 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3020 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3022 zExp = aExp - bExp + 0x3FFE;
3024 if ( bSig <= aSig ) {
3025 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3028 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3029 mul64To128( bSig, zSig0, &term0, &term1 );
3030 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3031 while ( (sbits64) rem0 < 0 ) {
3033 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3035 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3036 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3037 mul64To128( bSig, zSig1, &term1, &term2 );
3038 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3039 while ( (sbits64) rem1 < 0 ) {
3041 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3043 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3046 roundAndPackFloatx80(
3047 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3052 -------------------------------------------------------------------------------
3053 Returns the remainder of the extended double-precision floating-point value
3054 `a' with respect to the corresponding value `b'. The operation is performed
3055 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3056 -------------------------------------------------------------------------------
3058 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3060 flag aSign, bSign, zSign;
3061 int32 aExp, bExp, expDiff;
3062 bits64 aSig0, aSig1, bSig;
3063 bits64 q, term0, term1, alternateASig0, alternateASig1;
3066 aSig0 = extractFloatx80Frac( a );
3067 aExp = extractFloatx80Exp( a );
3068 aSign = extractFloatx80Sign( a );
3069 bSig = extractFloatx80Frac( b );
3070 bExp = extractFloatx80Exp( b );
3071 bSign = extractFloatx80Sign( b );
3072 if ( aExp == 0x7FFF ) {
3073 if ( (bits64) ( aSig0<<1 )
3074 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3075 return propagateFloatx80NaN( a, b );
3079 if ( bExp == 0x7FFF ) {
3080 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3086 float_raise( float_flag_invalid );
3087 z.low = floatx80_default_nan_low;
3088 z.high = floatx80_default_nan_high;
3091 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3094 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3095 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3097 bSig |= LIT64( 0x8000000000000000 );
3099 expDiff = aExp - bExp;
3101 if ( expDiff < 0 ) {
3102 if ( expDiff < -1 ) return a;
3103 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3106 q = ( bSig <= aSig0 );
3107 if ( q ) aSig0 -= bSig;
3109 while ( 0 < expDiff ) {
3110 q = estimateDiv128To64( aSig0, aSig1, bSig );
3111 q = ( 2 < q ) ? q - 2 : 0;
3112 mul64To128( bSig, q, &term0, &term1 );
3113 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3114 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3118 if ( 0 < expDiff ) {
3119 q = estimateDiv128To64( aSig0, aSig1, bSig );
3120 q = ( 2 < q ) ? q - 2 : 0;
3122 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3123 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3124 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3125 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3127 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3134 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3135 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3136 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3139 aSig0 = alternateASig0;
3140 aSig1 = alternateASig1;
3144 normalizeRoundAndPackFloatx80(
3145 80, zSign, bExp + expDiff, aSig0, aSig1 );
3150 -------------------------------------------------------------------------------
3151 Returns the square root of the extended double-precision floating-point
3152 value `a'. The operation is performed according to the IEC/IEEE Standard
3153 for Binary Floating-point Arithmetic.
3154 -------------------------------------------------------------------------------
3156 floatx80 floatx80_sqrt( floatx80 a )
3160 bits64 aSig0, aSig1, zSig0, zSig1;
3161 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3162 bits64 shiftedRem0, shiftedRem1;
3165 aSig0 = extractFloatx80Frac( a );
3166 aExp = extractFloatx80Exp( a );
3167 aSign = extractFloatx80Sign( a );
3168 if ( aExp == 0x7FFF ) {
3169 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3170 if ( ! aSign ) return a;
3174 if ( ( aExp | aSig0 ) == 0 ) return a;
3176 float_raise( float_flag_invalid );
3177 z.low = floatx80_default_nan_low;
3178 z.high = floatx80_default_nan_high;
3182 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3183 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3185 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3186 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3189 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3190 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3191 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3192 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3193 mul64To128( zSig0, zSig0, &term0, &term1 );
3194 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3195 while ( (sbits64) rem0 < 0 ) {
3197 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3199 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3201 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3202 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3203 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3204 if ( zSig1 == 0 ) zSig1 = 1;
3205 mul64To128( zSig0, zSig1, &term1, &term2 );
3206 shortShift128Left( term1, term2, 1, &term1, &term2 );
3207 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3208 mul64To128( zSig1, zSig1, &term2, &term3 );
3209 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3210 while ( (sbits64) rem1 < 0 ) {
3212 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3215 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3217 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3220 roundAndPackFloatx80(
3221 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3226 -------------------------------------------------------------------------------
3227 Returns 1 if the extended double-precision floating-point value `a' is
3228 equal to the corresponding value `b', and 0 otherwise. The comparison is
3229 performed according to the IEC/IEEE Standard for Binary Floating-point
3231 -------------------------------------------------------------------------------
3233 flag floatx80_eq( floatx80 a, floatx80 b )
3236 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3237 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3238 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3239 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3241 if ( floatx80_is_signaling_nan( a )
3242 || floatx80_is_signaling_nan( b ) ) {
3243 float_raise( float_flag_invalid );
3249 && ( ( a.high == b.high )
3251 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3257 -------------------------------------------------------------------------------
3258 Returns 1 if the extended double-precision floating-point value `a' is
3259 less than or equal to the corresponding value `b', and 0 otherwise. The
3260 comparison is performed according to the IEC/IEEE Standard for Binary
3261 Floating-point Arithmetic.
3262 -------------------------------------------------------------------------------
3264 flag floatx80_le( floatx80 a, floatx80 b )
3268 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3269 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3270 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3271 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3273 float_raise( float_flag_invalid );
3276 aSign = extractFloatx80Sign( a );
3277 bSign = extractFloatx80Sign( b );
3278 if ( aSign != bSign ) {
3281 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3285 aSign ? le128( b.high, b.low, a.high, a.low )
3286 : le128( a.high, a.low, b.high, b.low );
3291 -------------------------------------------------------------------------------
3292 Returns 1 if the extended double-precision floating-point value `a' is
3293 less than the corresponding value `b', and 0 otherwise. The comparison
3294 is performed according to the IEC/IEEE Standard for Binary Floating-point
3296 -------------------------------------------------------------------------------
3298 flag floatx80_lt( floatx80 a, floatx80 b )
3302 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3303 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3304 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3305 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3307 float_raise( float_flag_invalid );
3310 aSign = extractFloatx80Sign( a );
3311 bSign = extractFloatx80Sign( b );
3312 if ( aSign != bSign ) {
3315 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3319 aSign ? lt128( b.high, b.low, a.high, a.low )
3320 : lt128( a.high, a.low, b.high, b.low );
3325 -------------------------------------------------------------------------------
3326 Returns 1 if the extended double-precision floating-point value `a' is equal
3327 to the corresponding value `b', and 0 otherwise. The invalid exception is
3328 raised if either operand is a NaN. Otherwise, the comparison is performed
3329 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3330 -------------------------------------------------------------------------------
3332 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3335 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3336 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3337 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3338 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3340 float_raise( float_flag_invalid );
3345 && ( ( a.high == b.high )
3347 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3353 -------------------------------------------------------------------------------
3354 Returns 1 if the extended double-precision floating-point value `a' is less
3355 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3356 do not cause an exception. Otherwise, the comparison is performed according
3357 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3358 -------------------------------------------------------------------------------
3360 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3364 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3365 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3366 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3367 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3369 if ( floatx80_is_signaling_nan( a )
3370 || floatx80_is_signaling_nan( b ) ) {
3371 float_raise( float_flag_invalid );
3375 aSign = extractFloatx80Sign( a );
3376 bSign = extractFloatx80Sign( b );
3377 if ( aSign != bSign ) {
3380 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3384 aSign ? le128( b.high, b.low, a.high, a.low )
3385 : le128( a.high, a.low, b.high, b.low );
3390 -------------------------------------------------------------------------------
3391 Returns 1 if the extended double-precision floating-point value `a' is less
3392 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3393 an exception. Otherwise, the comparison is performed according to the
3394 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3395 -------------------------------------------------------------------------------
3397 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3401 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3402 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3403 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3404 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3406 if ( floatx80_is_signaling_nan( a )
3407 || floatx80_is_signaling_nan( b ) ) {
3408 float_raise( float_flag_invalid );
3412 aSign = extractFloatx80Sign( a );
3413 bSign = extractFloatx80Sign( b );
3414 if ( aSign != bSign ) {
3417 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3421 aSign ? lt128( b.high, b.low, a.high, a.low )
3422 : lt128( a.high, a.low, b.high, b.low );