Leibniz's Theory of Pi has been a great algorithm for me to practice writing in various languages. I first wrote it in Python since I needed something nontrivial to do in my Visual Basic class in high school. It was only able to do about 400,000 iterations in an hour and a half. Two years later, I'm pretty sure my x86 assembly version is a little faster. In between then I've written it in C, D, FORTRAN, Python, Lua, and Ruby. As I've learned more, I've since programmed it using SSE3 instructions and later in DEC Alpha assembly.


#include <stdio.h>

int main() {
	double sum = 1.0;
	double denominator = 3.0;
	double numerator = 1.0;
	double temp;
	unsigned long counter = 0;
	while (1) {
		numerator = -numerator;
		temp = numerator;
		temp /= denominator;
		sum += temp;
		denominator += 2;
		if ( counter > 10000000 ) {
			printf("1/%.0f - %19.17f\n", denominator, sum);
			counter = 0;
	return 0;

Here's a version that uses threading and SDL for output.

x86 Assembly using the FPU (FASM syntax)

	fild [two]
	fild [one]
	fild [one]
	fild [one]
	; align top of the loop to cache line size
	fadd st0,st(3)
	fxch st(1)
	fld st0
	fdiv st0, st(2)
	fadd st(3),st0
	fstp st0
	fxch st(1)
	; unroll loop as needed
	jmp top

one dd 1
two dd 2

x86-64 Assembly using SSE3 (FASM Syntax)

The heading says x86-64, but it could easily be modified to run on x86. It computes two terms in parallel using double precision.

format ELF64

extrn	printf
public	main

section '.text' executable align 64

	push	rbp
	mov	rbp,rsp
	movdqa	xmm6,dqword[denom]
	movdqa	xmm2,dqword[numer]
	movdqa	xmm3,dqword[add4]
	movdqa	xmm4,xmm2
	movdqa	xmm5,dqword[zero]
	mov	r12,10000000
align 64 
	divpd	xmm2,xmm6
	addpd	xmm5,xmm2
	movdqa	xmm2,xmm4
	addpd	xmm6,xmm3

	; unroll loop as needed
	sub	r12,1
	jnz	top
	mov	r12,10000000
	movdqa	xmm1,xmm5
	movdqa	xmm0,xmm6
	haddpd	xmm1,xmm1
	mov	eax,2
	mov	edi,msg
	call	printf
	jmp	top

section '.data' writable align 16

denom:	dq	1.0,3.0
numer:	dq	4.0,-4.0
add4:	dq	4.0,4.0
zero:	dq	0.0,0.0

msg	db	"1/%.0f - %50.48f",0xA,0

DEC Alpha Assembly (GAS Syntax)

The following code was written and tested in order to study RISC architectures on my Alphaserver DS20L.

F_ONE:	.t_floating	1.0
COUNT:	.quad		100000000
ONE:	.quad		1
PRINT:	.ascii		"1/%.0f - %19.17f\n"

	.align	4
	.set	noreorder
	.arch	ev6
	.globl	main
	.ent	main

main:	ldgp	$gp,0($27)
	stq	$26,0($sp)

	ldt	$f2,F_ONE		# $f2 num = 1.0
	ldq	$10,COUNT		# $10 = counter
	cpys	$f2,$f2,$f3		# $f3 sum = 1.0
	addt	$f2,$f2,$f4		# $f4 incrementor = 2.0
	ldq	$9,ONE			# $9 = 1
	mov	$10,$2			# load counter into $2
	addt	$f4,$f3,$f5		# $f5 denom = 3.0

	.align	4
loop:	cpysn	$f2,$f2,$f2		# num = -num
	subq	$2,$9,$2		# counter -= 1
	divt	$f2,$f5,$f10		# temp = num / denom
	addt	$f5,$f4,$f5		# denom += 2.0
	addt	$f3,$f10,$f3		# sum += temp

	bne	$2,loop			# if counter == 0, loop

	lda	$16,PRINT		# load format string
	cpys	$f5,$f5,$f17		# $f17 = denom
	cpys	$f3,$f3,$f18		# $f18 = sum
	jsr	$26,printf		# call printf
	ldgp	$gp,0($26)		# reload global pointer

	mov	$10,$2			# reset counter
	br	$31,loop		# loop back to top unconditionally

done:	mov	$31,$0			# return val = 0
	ldq	$26,0($sp)
	ret	$31,($26),1		# return
	.end	main