diff --git a/CHANGELOG b/CHANGELOG
index 517de23..caf6226 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,8 @@
+1.2.8 
+  Changed memory.h to string.h -- apparently more standard
+  
+  Added OpenMP extensions.  These can give nearly linear speedups for larger FFT sizes.
+
 1.2.7 
   Shrank the real-fft memory footprint. Thanks to Galen Seitz.
 
diff --git a/Makefile b/Makefile
index e7f2131..b272b40 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,18 @@
-KFVER=1_2_7
+KFVER=1_2_8
 
 DISTDIR=kiss_fft_v$(KFVER)
 TARBALL=kiss_fft_v$(KFVER).tar.gz
 ZIPFILE=kiss_fft_v$(KFVER).zip
 
+doc:
+	@echo "Start by reading the README file.  If you want to build and test lots of stuff, do a 'make testall'"
+	@echo "but be aware that 'make testall' has dependencies that the basic kissfft software does not"
+
 testall:
 	# The simd and int32_t types may or may not work on your machine 
-	#export DATATYPE=simd && cd test && make test
-	#export DATATYPE=int32_t && cd test && make test
-	#export DATATYPE=int16_t && cd test && make test
+	export DATATYPE=simd && cd test && make test
+	export DATATYPE=int32_t && cd test && make test
+	export DATATYPE=int16_t && cd test && make test
 	export DATATYPE=float && cd test && make test
 	export DATATYPE=double && cd test && make test
 
diff --git a/README b/README
index a5941ec..e1ce587 100644
--- a/README
+++ b/README
@@ -99,7 +99,7 @@ UNDER THE HOOD:
 
     Optimized butterflies are used for factors 2,3,4, and 5. 
 
-    The real optimization code only works for even length ffts.  It does two half-length
+    The real (i.e. not complex) optimization code only works for even length ffts.  It does two half-length
     FFTs in parallel (packed into real&imag), and then combines them via twiddling.
 
     The fast convolution filtering uses the overlap-scrap method, slightly 
diff --git a/TIPS b/TIPS
index c02f0c4..dc439dc 100644
--- a/TIPS
+++ b/TIPS
@@ -1,4 +1,8 @@
 Speed:
+    * If you want to use multiple cores, then compile with -openmp or -fopenmp (see your compiler docs).
+	Note that larger FFTs will benefit more than smaller FFTs. Running on multiple cores generally
+	uses more total CPU time, but less wall time.
+
     * experiment with compiler flags
         Special thanks to Oscar Lesta. He suggested some compiler flags 
         for gcc that make a big difference. They shave 10-15% off
@@ -12,7 +16,7 @@ Speed:
 
     * If you can rearrange your code to do 4 FFTs in parallel and you are on a recent Intel or AMD machine,
     then you might want to experiment with the USE_SIMD code.
-    
+
 
 Reducing code size:
     * remove some of the butterflies. There are currently butterflies optimized for radices
diff --git a/kiss_fft.c b/kiss_fft.c
index c6b2807..fb048e7 100644
--- a/kiss_fft.c
+++ b/kiss_fft.c
@@ -262,6 +262,29 @@ void kf_work(
     const int m=*factors++; /* stage's fft length/p */
     const kiss_fft_cpx * Fout_end = Fout + p*m;
 
+#ifdef _OPENMP
+    /* Use OpenMP only at the top level (fstride==1), never inside the recursion.
+     * m==1 is the leaf stage (plain copy loop below); recursing there would read past the last factor pair. */
+    if (fstride==1 && m!=1) {
+        int k;
+
+        /* execute the p independent sub-transforms (one per m-sized slice of Fout) in different threads */
+#       pragma omp parallel for
+        for (k=0;k<p;++k)
+            kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
+        /* implicit barrier of the parallel for: all threads have joined by this point */
+
+        switch (p) {
+            case 2: kf_bfly2(Fout,fstride,st,m); break;
+            case 3: kf_bfly3(Fout,fstride,st,m); break;
+            case 4: kf_bfly4(Fout,fstride,st,m); break;
+            case 5: kf_bfly5(Fout,fstride,st,m); break;
+            default: kf_bfly_generic(Fout,fstride,st,m,p); break;
+        }
+        return;
+    }
+#endif
+
     if (m==1) {
         do{
             *Fout = *f;
diff --git a/test/Makefile b/test/Makefile
index 3c90ab9..945001c 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -3,6 +3,12 @@ WARNINGS=-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Waggregate-return \
     -Wcast-align -Wcast-qual -Wnested-externs -Wshadow -Wbad-function-cast \
     -Wwrite-strings
 
+# for x86 pentium+ machines , these flags work well
+#CFLAGS=-O3  -march=pentiumpro -ffast-math -fomit-frame-pointer   -I.. -I../tools $(WARNINGS)
+# If the above flags do not work, try the following
+CFLAGS=-O3 -ffast-math -fomit-frame-pointer  -I.. -I../tools $(WARNINGS) 
+# TIP: try adding -openmp or -fopenmp  to enable OPENMP directives and use of multiple cores
+
 
 ifeq "$(NFFT)" ""
  NFFT=1800
@@ -55,10 +61,6 @@ all: tools $(BENCHKISS) $(SELFTEST) $(BENCHFFTW) $(TESTREAL) $(TESTKFC)
 tools:
 	cd ../tools && make all
 
-# for x86 pentium+ machines , these flags work well
-#CFLAGS=-O3  -march=pentiumpro -ffast-math -fomit-frame-pointer   -I.. -I../tools $(WARNINGS)
-# If the above flags do not work, try the following
-CFLAGS=-O3 -ffast-math -fomit-frame-pointer  -I.. -I../tools $(WARNINGS)
 
 $(SELFTEST): $(SELFTESTSRC) $(SRCFILES)
 	$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
diff --git a/tools/Makefile b/tools/Makefile
index 28e8cbd..c7b3950 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -35,7 +35,8 @@ all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL)
 
 #CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS)
 # If the above flags do not work, try the following
-CFLAGS=-Wall -O3 $(WARNINGS)
+CFLAGS=-Wall -O3 $(WARNINGS) 
+# tip: try -openmp or -fopenmp to use multiple cores
 
 $(FASTFILTREAL): ../kiss_fft.c kiss_fastfir.c kiss_fftr.c
 	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DREAL_FASTFIR -lm $+ -DFAST_FILT_UTIL