Leibniz's Theory of Pi has been a great algorithm for me to practice writing in various languages. I first wrote it in Python since I needed something nontrivial to do in my Visual Basic class in high school. It was only able to do about 400,000 iterations in an hour and a half. Two years later, I'm pretty sure my x86 assembly version is a little faster. In between then I've written it in C, D, FORTRAN, Python, Lua, and Ruby. As I've learned more, I've since programmed it using SSE3 instructions and later in DEC Alpha assembly.

C

#include <stdio.h>

int main() {
	double sum = 1.0;
	double denominator = 3.0;
	double numerator = 1.0;
	double temp;
	unsigned long counter = 0;
	
	while (1) {
		numerator = -numerator;
		temp = numerator;
		temp /= denominator;
		sum += temp;
		denominator += 2;
		counter++;
		if ( counter > 10000000 ) {
			printf("1/%.0f - %19.17f\n", denominator, sum);
			counter = 0;
		}
	}
	return 0;
}

Here's a version that uses threading and SDL for output.


x86 Assembly using the FPU (FASM syntax)

start:
	finit
	fild [two]
	fild [one]
	fild [one]
	fild [one]
	
	; align top of the loop to cache line size
top:
	fadd st0,st(3)
	fxch st(1)
	fchs
	fld st0
	fdiv st0, st(2)
	fadd st(3),st0
	fstp st0
	fxch st(1)
	
	; unroll loop as needed
	
	jmp top

	ret
	
one dd 1
two dd 2

x86-64 Assembly using SSE3 (FASM Syntax)

The heading says x86-64, but it could easily be modified to run on x86. It computes two terms in parallel using double precision.

format ELF64

extrn	printf
public	main

section '.text' executable align 64

main:
	push	rbp
	mov	rbp,rsp
	movdqa	xmm6,dqword[denom]
	movdqa	xmm2,dqword[numer]
	movdqa	xmm3,dqword[add4]
	movdqa	xmm4,xmm2
	movdqa	xmm5,dqword[zero]
	mov	r12,10000000
	
align 64 
top:
	divpd	xmm2,xmm6
	addpd	xmm5,xmm2
	movdqa	xmm2,xmm4
	addpd	xmm6,xmm3

	; unroll loop as needed
	
	sub	r12,1
	jnz	top
	
	mov	r12,10000000
	movdqa	xmm1,xmm5
	movdqa	xmm0,xmm6
	haddpd	xmm1,xmm1
	
	mov	eax,2
	mov	edi,msg
	call	printf
	
	jmp	top
	
	leave
	
	ret

section '.data' writable align 16

denom:	dq	1.0,3.0
numer:	dq	4.0,-4.0
add4:	dq	4.0,4.0
zero:	dq	0.0,0.0

msg	db	"1/%.0f - %50.48f",0xA,0

DEC Alpha Assembly (GAS Syntax)

The following code was written and tested in order to study RISC architectures on my Alphaserver DS20L.

.data
F_ONE:	.t_floating	1.0
COUNT:	.quad		100000000
ONE:	.quad		1
PRINT:	.ascii		"1/%.0f - %19.17f\n"

.text
	.align	4
	.set	noreorder
	.arch	ev6
	.globl	main
	.ent	main

main:	ldgp	$gp,0($27)
	stq	$26,0($sp)

	ldt	$f2,F_ONE		# $f2 num = 1.0
	ldq	$10,COUNT		# $10 = counter
	cpyse	$f2,$f2,$f3		# $f3 sum = 1.0
	addt	$f2,$f2,$f4		# $f4 incrementor = 2.0
	ldq	$9,ONE			# $9 = 1
	mov	$10,$2			# load counter into $2	
	addt	$f4,$f3,$f5		# $f5 denom = 3.0

	.align	4
loop:	cpysn	$f2,$f2,$f2		# num = -num
	subq	$2,$9,$2		# counter -= 1
	divt	$f2,$f5,$f10		# temp = num / denom
	addt	$f5,$f4,$f5		# denom += 2.0
	addt	$f3,$f10,$f3		# sum += temp

	bne	$2,loop			# if counter == 0, loop

	lda	$16,PRINT		# load format string
	cpyse	$f5,$f5,$f17		# $f17 = denom
	cpyse	$f3,$f3,$f18		# $f18 = sum
	jsr	$26,printf		# call printf
	ldgp	$gp,0($26)		# reload global pointer

	mov	$10,$2			# reset counter
	br	$31,loop		# loop back to top unconditionally

done:	mov	$31,$0			# return val = 0
	ldq	$26,0($sp)
	ret	$31,($26),1		# return
	.end	main