diff --git a/include/alltypes.h.in b/include/alltypes.h.in index 8478fb4..4cd2cb1 100644 --- a/include/alltypes.h.in +++ b/include/alltypes.h.in @@ -58,6 +58,10 @@ TYPEDEF struct { unsigned __attr; } pthread_mutexattr_t; TYPEDEF struct { unsigned __attr; } pthread_condattr_t; TYPEDEF struct { unsigned __attr; } pthread_barrierattr_t; TYPEDEF struct { unsigned __attr[2]; } pthread_rwlockattr_t; +TYPEDEF pthread_cond_t cnd_t; +TYPEDEF pthread_mutex_t mtx_t; + + TYPEDEF struct _IO_FILE FILE; diff --git a/include/threads.h b/include/threads.h new file mode 100644 index 0000000..313fdb4 --- /dev/null +++ b/include/threads.h @@ -0,0 +1,200 @@ +#ifndef _THREADS_H +#define _THREADS_H + +/* This one is explicitly allowed to be included. */ +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +#define __NEED_cnd_t +#define __NEED_mtx_t +/* Until we come up with a better naming scheme, we need to expose + some pthread types. */ +#define __NEED_pthread_cond_t +#define __NEED_pthread_mutex_t +#define __NEED_pthread_once_t +#define __NEED_pthread_t +#define __NEED_pthread_key_t + +#include +#include + +typedef struct __mtx_on_heap __mtx_t; +void __mtx_unref(__mtx_t *); +__mtx_t* __mtx_getref(mtx_t *); + +int __mtx_lock(__mtx_t *); +int __mtx_trylock(__mtx_t *); +int __mtx_unlock(__mtx_t *); +int __mtx_timedlock(__mtx_t *restrict, const struct timespec *restrict); + +typedef struct __cnd_on_heap __cnd_t; +void __cnd_unref(__cnd_t *); +__cnd_t* __cnd_getref(cnd_t *, __mtx_t *); + +int __cnd_signal(__cnd_t *); +int __cnd_broadcast(__cnd_t *); +int __cnd_wait(__cnd_t *restrict, __mtx_t *restrict); + +typedef pthread_t thrd_t; +typedef pthread_key_t tss_t; +typedef int (*thrd_start_t)(void*); +typedef void (*tss_dtor_t)(void*); + +typedef struct once_flag once_flag; + +/* Just have a copy of the definition for the moment. 
This has to be + replaced by sharing that declaration in alltypes.h */ +struct __once_cb { + void (*__f)(void *); + void *__x; + struct __ptcb *__next; +}; + +struct once_flag { + int __cntrl; + int __waiters; + struct __once_cb __cb; +}; + +#ifndef __THRD_EXPERIMENTAL +# define __THRD_EXPERIMENTAL 1 +#endif + + /* The following list of 9 integer constants makes up for the binary + compatibility of this C thread implementation. You must never + link code against versions of the C library that do not agree + upon these ABI parameters. + + Additionally this implementation assumes that the 5 types have + the same size across C libraries and that these types can be + initialized by the default initializer. + + The values for the 9 parameters are not fixed for now. Depending + on the choices of other implementations and the evolution of the + C standard they may still change. This should happen rarely, but + you have to consider the C thread feature to be experimental + until then, and be prepared to recompile your binary when linking + against a different implementation or a new version. + + The macro __THRD_EXPERIMENTAL will be defined as long as we + consider this ABI to be unstable. This comes with some link time + checks and an overhead of some bytes. At your own risk you may + switch this check off by defining __THRD_EXPERIMENTAL macro to + 0. 
*/ + +#define __THRD_SUCCESS 0 +#define __THRD_BUSY EBUSY +#define __THRD_ERROR EINVAL +#define __THRD_NOMEM ENOMEM +#define __THRD_TIMEDOUT ETIMEDOUT +#define __MTX_PLAIN 0 +#define __MTX_RECURSIVE 1 +#define __MTX_TIMED 2 +#define TSS_DTOR_ITERATIONS 4 + +#define __THRD_CONCAT10_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) \ +_0 ## _ ## _1 ## _ ## _2 ## _ ## _3 ## _ ## _4 ## _ ## _5 ## _ ## _6 ## _ ## _7 ## _ ## _8 ## _ ## _9 + +#define __THRD_CONCAT10(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) \ + __THRD_CONCAT10_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) + + +#define __THRD_ABI \ +__THRD_CONCAT10(__thrd_abi, \ + __THRD_SUCCESS, \ + __THRD_BUSY, \ + __THRD_ERROR, \ + __THRD_NOMEM, \ + __THRD_TIMEDOUT, \ + __MTX_PLAIN, \ + __MTX_RECURSIVE, \ + __MTX_TIMED, \ + TSS_DTOR_ITERATIONS \ + ) + +#define __THRD_SHFT(X, Y) (((X) << 3) ^ (Y)) + +enum { + __thrd_abi = + __THRD_SHFT(sizeof(cnd_t), + __THRD_SHFT(sizeof(mtx_t), + __THRD_SHFT(sizeof(once_flag), + __THRD_SHFT(sizeof(thrd_t), + sizeof(tss_t))))), +}; + +extern unsigned const __THRD_ABI; + +#define __THRD_ABI_CHECK (1/(__THRD_ABI == __thrd_abi)) + +#if __THRD_EXPERIMENTAL +# define __THRD_ABI_MARK __attribute__((used)) static unsigned const*const __thrd_abi_mark = &__THRD_ABI +#else +# define __THRD_ABI_MARK typedef unsigned __thrd_abi_dummy_type +#endif + +enum { + thrd_success = __THRD_SUCCESS, + thrd_busy = __THRD_BUSY, + thrd_error = __THRD_ERROR, + thrd_nomem = __THRD_NOMEM, + thrd_timedout = __THRD_TIMEDOUT, +}; + +enum { + mtx_plain = __MTX_PLAIN, + mtx_recursive = __MTX_RECURSIVE, + // all mutexes are timed, here. 
so this is a no-op + mtx_timed = __MTX_TIMED, +}; + +#define ONCE_FLAG_INIT { ._cntrl = 0, } +#define thread_local _Thread_local + +int thrd_create(thrd_t *, thrd_start_t, void *); +_Noreturn void thrd_exit(int); + +int thrd_detach(thrd_t); +int thrd_join(thrd_t, int *); + +int thrd_sleep(const struct timespec *, struct timespec *); +void thrd_yield(void); + +thrd_t thrd_current(void); +int thrd_equal(thrd_t, thrd_t); +#define thrd_equal(A, B) ((A) == (B)) + +void call_once(once_flag *, void (*)(void)); + +int mtx_init(mtx_t *, int); +void mtx_destroy(mtx_t *); + +int mtx_lock(mtx_t *); +int mtx_timedlock(mtx_t *restrict, const struct timespec *restrict); +int mtx_trylock(mtx_t *); +int mtx_unlock(mtx_t *); + +int cnd_init(cnd_t *); +void cnd_destroy(cnd_t *); + +int cnd_broadcast(cnd_t *); +int cnd_signal(cnd_t *); + +int cnd_timedwait(cnd_t *restrict, mtx_t *restrict, const struct timespec *restrict); +int cnd_wait(cnd_t *, mtx_t *); + +int tss_create(tss_t *, tss_dtor_t); +void tss_delete(tss_t key); + +int tss_set(tss_t, void *); +void *tss_get(tss_t); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h index 650e811..6f4beab 100644 --- a/src/internal/pthread_impl.h +++ b/src/internal/pthread_impl.h @@ -51,6 +51,51 @@ struct __timer { pthread_t thread; }; +/* For the moment this type has the same layout as + pthread_mutex_t. This is subject to change in later versions of the + C thread implementation, be it for the simple reason that this + would reduce the size of the structure by half. 
*/ +struct __mtx_on_heap { + int _mt_typ; + int _mt_lck; + int _mt_wts; + int _mt_pad3; + int _mt_pad4; + int _mt_cnt; + int _mt_rfs; + int _mt_pad7; + int _mt_pad8; + int _mt_pad9; +}; + +static inline +void __mtx_addref(struct __mtx_on_heap *m) { + if (m) { + a_inc(&m->_mt_rfs); + } +} + +struct __cnd_on_heap { + int rfs; + int tok; + struct __mtx_on_heap* mtx; +}; + +static inline +void __cnd_addref(struct __cnd_on_heap *c) { + if (c) { + a_inc(&c->rfs); + } +} + +/* Unless told otherwise this C thread implementation will only use + private futexes. */ +#ifdef __MTX_SHARED +#define THRD_PRIVATE 0 +#else +#define THRD_PRIVATE 128 +#endif + #define __SU (sizeof(size_t)/sizeof(int)) #define _a_stacksize __u.__s[0] @@ -66,6 +111,12 @@ struct __timer { #define _m_prev __u.__p[3] #define _m_next __u.__p[4] #define _m_count __u.__i[5] +#define _mx_type __u.__i[0] +#define _mx_lock __u.__i[1] +#define _mx_mtx __u.__p[2] +#define _mx_prev __u.__p[3] /* unused */ +#define _mx_next __u.__p[4] /* unused */ +#define _mx_count __u.__i[5] /* unused */ #define _c_mutex __u.__p[0] #define _c_seq __u.__i[2] #define _c_waiters __u.__i[3] @@ -74,6 +125,14 @@ struct __timer { #define _c_lockwait __u.__i[6] #define _c_waiters2 __u.__i[7] #define _c_destroy __u.__i[8] +#define _cx_cnd __u.__p[0] +#define _cx_seq __u.__i[2] /* unused */ +#define _cx_waiters __u.__i[3] /* unused */ +#define _cx_clock __u.__i[4] /* unused */ +#define _cx_lock __u.__i[5] +#define _cx_lockwait __u.__i[6]/* unused */ +#define _cx_waiters2 __u.__i[7]/* unused */ +#define _cx_destroy __u.__i[8] /* unused */ #define _rw_lock __u.__i[0] #define _rw_waiters __u.__i[1] #define _b_lock __u.__i[0] @@ -83,6 +142,24 @@ struct __timer { #define _b_waiters2 __u.__i[4] #define _b_inst __u.__p[3] +#if defined(__ATOMIC_ACQUIRE) && defined(__ATOMIC_RELEASE) +#define a_splck(X) \ +({ \ + int * _lck = (X); \ + while (__atomic_test_and_set(_lck, __ATOMIC_ACQUIRE)) (void)0; \ + __atomic_thread_fence(__ATOMIC_ACQUIRE); \ + }) 
+#define a_spunl(X) ({ int * _lck = (X); __atomic_clear(_lck, __ATOMIC_RELEASE); }) +#else +#define a_splck(X) \ +({ \ + int * _lck = (X); \ + while (__sync_lock_test_and_set(_lck, 1)) (void)0; \ + __sync_synchronize(); \ + }) +#define a_spunl(X) ({ int * _lck = (X); __sync_lock_release(_lck); }) +#endif + #include "pthread_arch.h" #define SIGTIMER 32 @@ -111,6 +188,10 @@ void __wait(volatile int *, volatile int *, int, int); #define __wake(addr, cnt, priv) \ __syscall(SYS_futex, addr, FUTEX_WAKE, (cnt)<0?INT_MAX:(cnt)) +void __wait_priv(volatile int *, volatile int *, int); +#define __wake_priv(addr, cnt) \ + __syscall(SYS_futex, addr, FUTEX_WAKE|THRD_PRIVATE, (cnt)<0?INT_MAX:(cnt)) + void __acquire_ptc(); void __release_ptc(); void __inhibit_ptc(); diff --git a/src/mman/mprotect.c b/src/mman/mprotect.c index f488486..535787b 100644 --- a/src/mman/mprotect.c +++ b/src/mman/mprotect.c @@ -2,10 +2,12 @@ #include "libc.h" #include "syscall.h" -int mprotect(void *addr, size_t len, int prot) +int __mprotect(void *addr, size_t len, int prot) { size_t start, end; start = (size_t)addr & -PAGE_SIZE; end = (size_t)((char *)addr + len + PAGE_SIZE-1) & -PAGE_SIZE; return syscall(SYS_mprotect, start, end-start, prot); } + +weak_alias(__mprotect, mprotect); diff --git a/src/sched/thrd_yield.c b/src/sched/thrd_yield.c new file mode 100644 index 0000000..301e953 --- /dev/null +++ b/src/sched/thrd_yield.c @@ -0,0 +1,7 @@ +#include +#include "syscall.h" + +void thrd_yield() +{ + (void)syscall(SYS_sched_yield); +} diff --git a/src/thread/__cnd_getref.c b/src/thread/__cnd_getref.c new file mode 100644 index 0000000..e96d816 --- /dev/null +++ b/src/thread/__cnd_getref.c @@ -0,0 +1,64 @@ +#include "pthread_impl.h" +#include + +static inline __cnd_t* __cnd_init(__cnd_t *c, __mtx_t *m) { + if (c) { + *c = (__cnd_t) { + .rfs = 2, + .mtx = m, + }; + } + if (m) __mtx_addref(m); + return c; +} + +static inline void __cnd_delete(__cnd_t *c) { + if (c) { + __mtx_unref(c->mtx); + c->mtx = 0; + 
free(c); + } +} + +void __cnd_unref(__cnd_t *c) { + if (c) { + if (a_fetch_add(&c->rfs, -1) == 1) { + __cnd_delete(c); + } + } +} + +static inline __cnd_t* __cnd_getref_def(cnd_t *cond, __mtx_t* m, __cnd_t* def) { + __cnd_t * ret; + __mtx_t * prev = 0; + /* Critical section protected by spin lock . */ + a_splck(&cond->_cx_lock); + ret = cond->_cx_cnd; + if (ret) { + __cnd_addref(ret); + if (m && ret->mtx != m) { + prev = ret->mtx; + __mtx_addref(m); + ret->mtx = m; + } + } else if (def) { + ret = def; + cond->_cx_cnd = def; + } + a_spunl(&cond->_cx_lock); + if (prev) __mtx_unref(prev); + return ret; +} + +__cnd_t* __cnd_getref(cnd_t *cond, __mtx_t * m) { + __cnd_t * ret = __cnd_getref_def(cond, m, 0); + if (!ret) { + __cnd_t * new = __cnd_init(malloc(sizeof *ret), m); + if (new) { + ret = __cnd_getref_def(cond, m, new); + /* somebody sneaked in between the first and second call */ + if (ret != new) __cnd_delete(new); + } + } + return ret; +} diff --git a/src/thread/__mtx_getref.c b/src/thread/__mtx_getref.c new file mode 100644 index 0000000..22b3db4 --- /dev/null +++ b/src/thread/__mtx_getref.c @@ -0,0 +1,49 @@ +#include "pthread_impl.h" +#include + +static inline __mtx_t* __mtx_init(__mtx_t *m, int type) { + if (m) { + *m = (__mtx_t) { + ._mt_rfs = 2, + ._mt_typ = type, + }; + } + return m; +} + +static inline void __mtx_delete(__mtx_t * m) { + free(m); +} + +void __mtx_unref(__mtx_t *m) { + if (m) { + if (a_fetch_add(&m->_mt_rfs, -1) == 1) { + __mtx_delete(m); + } + } +} + +static inline __mtx_t* __mtx_getref_def(mtx_t *mut, __mtx_t* def) { + /* Critical section protected by spin lock . 
*/ + a_splck(&mut->_mx_lock); + __mtx_t * ret = mut->_mx_mtx; + if (ret) { + __mtx_addref(ret); + } else if (def) { + ret = def; + mut->_mx_mtx = def; + } + a_spunl(&mut->_mx_lock); + return ret; +} + +__mtx_t* __mtx_getref(mtx_t *mut) { + __mtx_t * ret = __mtx_getref_def(mut, 0); + if (!ret) { + __mtx_t * new = __mtx_init(malloc(sizeof *ret), mut->_mx_type); + ret = __mtx_getref_def(mut, new); + /* somebody sneaked in between the first and second call */ + if (ret != new) __mtx_delete(new); + } + return ret; +} diff --git a/src/thread/__thrd_abi.c b/src/thread/__thrd_abi.c new file mode 100644 index 0000000..e5674e6 --- /dev/null +++ b/src/thread/__thrd_abi.c @@ -0,0 +1,3 @@ +#include + +unsigned const __THRD_ABI = __thrd_abi; diff --git a/src/thread/__thrd_wait.c b/src/thread/__thrd_wait.c new file mode 100644 index 0000000..6351cb6 --- /dev/null +++ b/src/thread/__thrd_wait.c @@ -0,0 +1,26 @@ +#include "pthread_impl.h" +#include + +int __clock_gettime(clockid_t clk, struct timespec *ts); + +int __thrd_wait(volatile int *addr, int val, const struct timespec *at) +{ + int r; + struct timespec to, *top=0; + + if (at) { + if (at->tv_nsec >= 1000000000UL) return EINVAL; + if (__clock_gettime(CLOCK_REALTIME, &to)) return EINVAL; + to.tv_sec = at->tv_sec - to.tv_sec; + if ((to.tv_nsec = at->tv_nsec - to.tv_nsec) < 0) { + to.tv_sec--; + to.tv_nsec += 1000000000; + } + if (to.tv_sec < 0) return ETIMEDOUT; + top = &to; + } + + r = -__syscall_cp(SYS_futex, addr, FUTEX_WAIT|THRD_PRIVATE, val, top); + if (r == EINTR || r == EINVAL || r == ETIMEDOUT || r == EWOULDBLOCK) return r; + return 0; +} diff --git a/src/thread/__timedwait.c b/src/thread/__timedwait.c index 302273a..3aed2bd 100644 --- a/src/thread/__timedwait.c +++ b/src/thread/__timedwait.c @@ -4,6 +4,9 @@ #include "futex.h" #include "syscall.h" +int __pthread_setcancelstate(int new, int *old); +int __clock_gettime(clockid_t clk, struct timespec *ts); + static int do_wait(volatile int *addr, int val, clockid_t clk, 
const struct timespec *at, int priv) { @@ -12,7 +15,7 @@ static int do_wait(volatile int *addr, int val, if (at) { if (at->tv_nsec >= 1000000000UL) return EINVAL; - if (clock_gettime(clk, &to)) return EINVAL; + if (__clock_gettime(clk, &to)) return EINVAL; to.tv_sec = at->tv_sec - to.tv_sec; if ((to.tv_nsec = at->tv_nsec - to.tv_nsec) < 0) { to.tv_sec--; @@ -33,13 +36,13 @@ int __timedwait(volatile int *addr, int val, { int r, cs; - if (!cleanup) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs); + if (!cleanup) __pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs); pthread_cleanup_push(cleanup, arg); r = do_wait(addr, val, clk, at, priv); pthread_cleanup_pop(0); - if (!cleanup) pthread_setcancelstate(cs, 0); + if (!cleanup) __pthread_setcancelstate(cs, 0); return r; } diff --git a/src/thread/__wait_priv.c b/src/thread/__wait_priv.c new file mode 100644 index 0000000..f8358cd --- /dev/null +++ b/src/thread/__wait_priv.c @@ -0,0 +1,14 @@ +#include "pthread_impl.h" + +void __wait_priv(volatile int *addr, volatile int *waiters, int val) +{ + int spins=10000; + while (spins--) { + if (*addr==val) a_spin(); + else return; + } + if (waiters) a_inc(waiters); + while (*addr==val) + __syscall(SYS_futex, addr, FUTEX_WAIT|THRD_PRIVATE, val, 0); + if (waiters) a_dec(waiters); +} diff --git a/src/thread/call_once.c b/src/thread/call_once.c new file mode 100644 index 0000000..1d3540e --- /dev/null +++ b/src/thread/call_once.c @@ -0,0 +1,40 @@ +#include "pthread_impl.h" +#include + +#define pthread_cleanup_push_static(f, x) do { static struct __ptcb __cb; _pthread_cleanup_push(&__cb, f, x) + +static void undo(void *flg) +{ + once_flag * flag = flg; + a_store(&flag->__cntrl, 0); + __wake_priv(&flag->__cntrl, 1); +} + +void call_once(once_flag *flag, void (*func)(void)) +{ + __THRD_ABI_MARK; + + /* Return immediately if init finished before */ + if (flag->__cntrl == 2) return; + + /* Try to enter initializing state. 
Three possibilities: + * 0 - we're the first or the other cancelled; run init + * 1 - another thread is running init; wait + * 2 - another thread finished running init; just return */ + + for (;;) switch (a_cas(&flag->__cntrl, 0, 1)) { + case 0: + _pthread_cleanup_push((void*)&flag->__cb, undo, flag); + func(); + _pthread_cleanup_pop((void*)&flag->__cb, 0); + + a_store(&flag->__cntrl, 2); + if (flag->__waiters) __wake(&flag->__cntrl, -1, 0); + return; + case 1: + __wait_priv(&flag->__cntrl, &flag->__waiters, 1); + continue; + case 2: + return; + } +} diff --git a/src/thread/cnd_broadcast.c b/src/thread/cnd_broadcast.c new file mode 100644 index 0000000..9e81b64 --- /dev/null +++ b/src/thread/cnd_broadcast.c @@ -0,0 +1,48 @@ +#include "pthread_impl.h" +#include + +int __cnd_broadcast(__cnd_t *c) +{ + int ret = 0; + __mtx_t * mtx = c->mtx; + /* If the mtx isn't set, this __cnd has never been used for a wait, + nothing to do. */ + if (mtx) { + /* We can upload as much tokens as we wish, here since this + __cnd_t is already unlinked from its cnd_t. It will never be + used for any new waiter. */ + a_store(&c->tok, INT_MAX); + /* Perform the futex requeue, waking one waiter unless we know + * that the calling thread holds the mutex. */ + int wakeup = !mtx->_mt_typ || (mtx->_mt_lck&INT_MAX)!=__pthread_self()->tid; + ret = __syscall(SYS_futex, &c->tok, FUTEX_REQUEUE|THRD_PRIVATE, + 0, + INT_MAX, &mtx->_mt_lck, c->tok); + /* Do the bookkeeping for the mutex and wake up one thread eventually. */ + if (ret > wakeup) a_fetch_add(&mtx->_mt_wts, ret-wakeup); + if (ret > 0 && wakeup) ret = __syscall(SYS_futex, &mtx->_mt_lck, FUTEX_WAKE|THRD_PRIVATE, 1); + } + return ret < 0 ? thrd_error : thrd_success; +} + +int cnd_broadcast(cnd_t * cond) +{ + int ret = thrd_success; + /* Critical section protected by spin lock */ + a_splck(&cond->_cx_lock); + __cnd_t * c = cond->_cx_cnd; + /* If there are waiters, all will be waiting on c. 
Since we hold the + spinlock no other waiters can sneak in. Lock them permanently out + of using this one, here. As a consequence we don't have inc the + reference count: their is no change in the total number of + references. */ + cond->_cx_cnd = 0; + a_spunl(&cond->_cx_lock); + + /* If c is 0, there haven't been any waiters, yet, nothing to do. */ + if (c) { + ret = __cnd_broadcast(c); + __cnd_unref(c); + } + return ret; +} diff --git a/src/thread/cnd_destroy.c b/src/thread/cnd_destroy.c new file mode 100644 index 0000000..463b137 --- /dev/null +++ b/src/thread/cnd_destroy.c @@ -0,0 +1,13 @@ +#include "pthread_impl.h" +#include + +void cnd_destroy(cnd_t *cond) +{ + /* Critical section protected by spin lock */ + a_splck(&cond->_cx_lock); + __cnd_t * ret = cond->_cx_cnd; + cond->_cx_cnd = 0; + a_spunl(&cond->_cx_lock); + + if (ret) __cnd_unref(ret); +} diff --git a/src/thread/cnd_init.c b/src/thread/cnd_init.c new file mode 100644 index 0000000..9bcd051 --- /dev/null +++ b/src/thread/cnd_init.c @@ -0,0 +1,12 @@ +#include "pthread_impl.h" +#include + +int cnd_init(cnd_t * c) +{ + *c = (cnd_t) { + 0 + }; + __cnd_t* ret = __cnd_getref(c, 0); + __cnd_unref(ret); + return ret ? thrd_success : thrd_nomem; +} diff --git a/src/thread/cnd_signal.c b/src/thread/cnd_signal.c new file mode 100644 index 0000000..9ed765b --- /dev/null +++ b/src/thread/cnd_signal.c @@ -0,0 +1,25 @@ +#include "pthread_impl.h" +#include + +int cnd_signal(cnd_t * cond) { + int ret = thrd_success; + /* Critical section protected by spin lock */ + a_splck(&cond->_cx_lock); + __cnd_t * c = cond->_cx_cnd; + if (c) { + /* If the mtx isn't set, this __cnd_t has never been used for a + wait, nothing to do. */ + if (!c->mtx) c = 0; + else __cnd_addref(c); + } + a_spunl(&cond->_cx_lock); + + /* If c is 0, there haven't been any waiters, yet, nothing to do. */ + if (c) { + a_inc(&c->tok); + ret = __syscall(SYS_futex, &c->tok, FUTEX_WAKE|THRD_PRIVATE, 1); + ret = (ret < 0) ? 
thrd_error : thrd_success; + __cnd_unref(c); + } + return ret; +} diff --git a/src/thread/cnd_timedwait.c b/src/thread/cnd_timedwait.c new file mode 100644 index 0000000..b34604e --- /dev/null +++ b/src/thread/cnd_timedwait.c @@ -0,0 +1,56 @@ +#include "pthread_impl.h" +#include + +void __pthread_testcancel(void); +int __clock_gettime(clockid_t clk, struct timespec *ts); +int __thrd_wait(volatile int *addr, int val, const struct timespec *at); + +static int __cnd_timedwait(__cnd_t *restrict c, __mtx_t *restrict mut, const struct timespec *restrict ts) +{ + int e, r; + __mtx_unlock(mut); + + do { + e = 0; + /* get a token if some is available */ + for (int token = c->tok; token > 0;) { + int prev = a_cas(&c->tok, token, token - 1); + if (prev == token) { + goto RELOCK; + } + token = prev; + } + + e = __thrd_wait(&c->tok, 0, ts); + + } while (!e || e == EINTR || e == EWOULDBLOCK); + + if (e != ETIMEDOUT) return thrd_error; + +RELOCK: + if ((r=__mtx_lock(mut))) return r; + + switch (e) { + case 0: + return thrd_success; + case ETIMEDOUT: + return thrd_timedout; + default: + return thrd_error; + } +} + +int cnd_timedwait(cnd_t *restrict cond, mtx_t *restrict mut, const struct timespec *restrict ts) +{ + int ret = thrd_error; + __mtx_t * m = __mtx_getref(mut); + if (m) { + __cnd_t * c = __cnd_getref(cond, m); + if (c) { + ret = __cnd_timedwait(c, m, ts); + __cnd_unref(c); + } + __mtx_unref(m); + } + return ret; +} diff --git a/src/thread/cnd_wait.c b/src/thread/cnd_wait.c new file mode 100644 index 0000000..91e89db --- /dev/null +++ b/src/thread/cnd_wait.c @@ -0,0 +1,10 @@ +#include + +int cnd_wait(cnd_t *cond, mtx_t *mutex) +{ + /* Calling cnd_timedwait with a null pointer is an + extension. Such a call is convenient, here since it avoids to + repeat the case analysis that is already done for + cnd_timedwait. 
*/ + return cnd_timedwait(cond, mutex, 0); +} diff --git a/src/thread/mtx_destroy.c b/src/thread/mtx_destroy.c new file mode 100644 index 0000000..c8e4e68 --- /dev/null +++ b/src/thread/mtx_destroy.c @@ -0,0 +1,11 @@ +#include "pthread_impl.h" +#include + +void (mtx_destroy)(mtx_t *mut) { + /* Critical section protected by spin lock */ + a_splck(&mut->_mx_lock); + __mtx_t * ret = mut->_mx_mtx; + mut->_mx_mtx = 0; + a_spunl(&mut->_mx_lock); + if (ret) __mtx_unref(ret); +} diff --git a/src/thread/mtx_init.c b/src/thread/mtx_init.c new file mode 100644 index 0000000..40cc6c1 --- /dev/null +++ b/src/thread/mtx_init.c @@ -0,0 +1,12 @@ +#include "pthread_impl.h" +#include + +int mtx_init(mtx_t * m, int type) +{ + *m = (mtx_t) { + ._mx_type = ((type&mtx_recursive) ? PTHREAD_MUTEX_RECURSIVE : 0), + }; + __mtx_t* ret = __mtx_getref(m); + __mtx_unref(ret); + return ret ? thrd_success : thrd_error; +} diff --git a/src/thread/mtx_lock.c b/src/thread/mtx_lock.c new file mode 100644 index 0000000..59bc1b2 --- /dev/null +++ b/src/thread/mtx_lock.c @@ -0,0 +1,13 @@ +#include "pthread_impl.h" +#include + +int __mtx_lock(__mtx_t *mut) +{ + return __mtx_timedlock(mut, 0); +} + +int mtx_lock(mtx_t *mut) +{ + __THRD_ABI_MARK; + return mtx_timedlock(mut, 0); +} diff --git a/src/thread/mtx_timedlock.c b/src/thread/mtx_timedlock.c new file mode 100644 index 0000000..cf52749 --- /dev/null +++ b/src/thread/mtx_timedlock.c @@ -0,0 +1,46 @@ +#include "pthread_impl.h" +#include + +int __thrd_wait(volatile int *addr, int val, const struct timespec *at); + +int __mtx_timedlock(__mtx_t *restrict mtx, const struct timespec *restrict at) +{ + int r, t; + + if (mtx->_mt_typ == PTHREAD_MUTEX_NORMAL && !a_cas(&mtx->_mt_lck, 0, thrd_busy)) + return thrd_success; + + for (;;) { + r=__mtx_trylock(mtx); + if (r != thrd_busy) return r; + else { + if (!(r=mtx->_mt_lck) || (r&0x40000000)) continue; + a_inc(&mtx->_mt_wts); + t = r | 0x80000000; + a_cas(&mtx->_mt_lck, r, t); + r = __thrd_wait(&mtx->_mt_lck, t, 
at); + a_dec(&mtx->_mt_wts); + switch (r) { + case 0: + break; + case EINTR: + break; + case EWOULDBLOCK: + break; + case ETIMEDOUT: + return thrd_timedout; + default: + return thrd_error; + } + } + } +} + +int mtx_timedlock(mtx_t *restrict mut, const struct timespec *restrict at) +{ + __mtx_t * m = __mtx_getref(mut); + if (!m) return thrd_error; + int ret = __mtx_timedlock(m, at); + __mtx_unref(m); + return ret; +} diff --git a/src/thread/mtx_trylock.c b/src/thread/mtx_trylock.c new file mode 100644 index 0000000..852fa15 --- /dev/null +++ b/src/thread/mtx_trylock.c @@ -0,0 +1,33 @@ +#include "pthread_impl.h" +#include + +int __mtx_trylock(__mtx_t *mtx) +{ + int tid, old, own, ret = 0; + pthread_t self; + if (mtx->_mt_typ == PTHREAD_MUTEX_NORMAL) { + return a_cas(&mtx->_mt_lck, 0, thrd_busy) & thrd_busy; + } else { + self = __pthread_self(); + tid = self->tid; + old = mtx->_mt_lck; + own = old & 0x7fffffff; + if (own == tid) { + if ((unsigned)mtx->_mt_cnt >= INT_MAX) return thrd_error; + else mtx->_mt_cnt++; + } else { + if ((own && !(own & 0x40000000)) || a_cas(&mtx->_mt_lck, old, tid)!=old) + return thrd_busy; + } + } + return thrd_success; +} + +int mtx_trylock(mtx_t *mut) +{ + __mtx_t * m = __mtx_getref(mut); + if (!m) return thrd_error; + int ret = __mtx_trylock(m); + __mtx_unref(m); + return ret; +} diff --git a/src/thread/mtx_unlock.c b/src/thread/mtx_unlock.c new file mode 100644 index 0000000..5c98b07 --- /dev/null +++ b/src/thread/mtx_unlock.c @@ -0,0 +1,28 @@ +#include "pthread_impl.h" +#include + +int __mtx_unlock(__mtx_t *mtx) +{ + int ret = thrd_success; + if ((mtx->_mt_typ&3) == PTHREAD_MUTEX_RECURSIVE && mtx->_mt_cnt) { + if ((mtx->_mt_lck&0x1fffffff) != __pthread_self()->tid) + ret = thrd_error; + /* _m_count is the count of additional locks, no need to real unlock */ + else --mtx->_mt_cnt; + } else { + if (a_swap(&mtx->_mt_lck, 0)<0 || mtx->_mt_wts) + __syscall(SYS_futex, &mtx->_mt_lck, FUTEX_WAKE|THRD_PRIVATE, 1); + } + return ret; +} + +int 
mtx_unlock(mtx_t *mut) +{ + int ret = thrd_error; + __mtx_t * m = __mtx_getref(mut); + if (m) { + ret = __mtx_unlock(m); + __mtx_unref(m); + } + return ret; +} diff --git a/src/thread/pthread_cond_timedwait.c b/src/thread/pthread_cond_timedwait.c index 99d62cc..f43ca9e 100644 --- a/src/thread/pthread_cond_timedwait.c +++ b/src/thread/pthread_cond_timedwait.c @@ -1,5 +1,7 @@ #include "pthread_impl.h" +void __pthread_testcancel(void); + struct cm { pthread_cond_t *c; pthread_mutex_t *m; @@ -47,7 +49,7 @@ int pthread_cond_timedwait(pthread_cond_t *restrict c, pthread_mutex_t *restrict if (ts && ts->tv_nsec >= 1000000000UL) return EINVAL; - pthread_testcancel(); + __pthread_testcancel(); a_inc(&c->_c_waiters); diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c index e77e54a..d19d78b 100644 --- a/src/thread/pthread_create.c +++ b/src/thread/pthread_create.c @@ -3,97 +3,21 @@ #include "stdio_impl.h" #include "libc.h" #include +#include + +void *__mmap(void *, size_t, int, int, int, off_t); +int __munmap(void *, size_t); +int __mprotect(void *, size_t, int); +void __thread_enable(void); +_Noreturn void __pthread_exit(void *); +void *__copy_tls(unsigned char *); +extern volatile size_t __pthread_tsd_size; static void dummy_0() { } weak_alias(dummy_0, __acquire_ptc); weak_alias(dummy_0, __release_ptc); -weak_alias(dummy_0, __pthread_tsd_run_dtors); - -_Noreturn void pthread_exit(void *result) -{ - pthread_t self = __pthread_self(); - sigset_t set; - - self->result = result; - - while (self->cancelbuf) { - void (*f)(void *) = self->cancelbuf->__f; - void *x = self->cancelbuf->__x; - self->cancelbuf = self->cancelbuf->__next; - f(x); - } - - __pthread_tsd_run_dtors(); - - __lock(self->exitlock); - - /* Mark this thread dead before decrementing count */ - __lock(self->killlock); - self->dead = 1; - - /* Block all signals before decrementing the live thread count. 
- * This is important to ensure that dynamically allocated TLS - * is not under-allocated/over-committed, and possibly for other - * reasons as well. */ - __block_all_sigs(&set); - - /* Wait to unlock the kill lock, which governs functions like - * pthread_kill which target a thread id, until signals have - * been blocked. This precludes observation of the thread id - * as a live thread (with application code running in it) after - * the thread was reported dead by ESRCH being returned. */ - __unlock(self->killlock); - - /* It's impossible to determine whether this is "the last thread" - * until performing the atomic decrement, since multiple threads - * could exit at the same time. For the last thread, revert the - * decrement and unblock signals to give the atexit handlers and - * stdio cleanup code a consistent state. */ - if (a_fetch_add(&libc.threads_minus_1, -1)==0) { - libc.threads_minus_1 = 0; - __restore_sigs(&set); - exit(0); - } - - if (self->locale != &libc.global_locale) { - a_dec(&libc.uselocale_cnt); - if (self->locale->ctype_utf8) - a_dec(&libc.bytelocale_cnt_minus_1); - } - - if (self->detached && self->map_base) { - /* Detached threads must avoid the kernel clear_child_tid - * feature, since the virtual address will have been - * unmapped and possibly already reused by a new mapping - * at the time the kernel would perform the write. In - * the case of threads that started out detached, the - * initial clone flags are correct, but if the thread was - * detached later (== 2), we need to clear it here. */ - if (self->detached == 2) __syscall(SYS_set_tid_address, 0); - - /* The following call unmaps the thread's stack mapping - * and then exits without touching the stack. 
*/ - __unmapself(self->map_base, self->map_size); - } - - for (;;) __syscall(SYS_exit, 0); -} - -void __do_cleanup_push(struct __ptcb *cb) -{ - if (!libc.has_thread_pointer) return; - struct pthread *self = __pthread_self(); - cb->__next = self->cancelbuf; - self->cancelbuf = cb; -} - -void __do_cleanup_pop(struct __ptcb *cb) -{ - if (!libc.has_thread_pointer) return; - __pthread_self()->cancelbuf = cb->__next; -} static int start(void *p) { @@ -115,28 +39,10 @@ static int start(void *p) #define ROUND(x) (((x)+PAGE_SIZE-1)&-PAGE_SIZE) -/* pthread_key_create.c overrides this */ -static volatile size_t dummy = 0; -weak_alias(dummy, __pthread_tsd_size); -static void *dummy_tsd[1] = { 0 }; -weak_alias(dummy_tsd, __pthread_tsd_main); - -static FILE *volatile dummy_file = 0; -weak_alias(dummy_file, __stdin_used); -weak_alias(dummy_file, __stdout_used); -weak_alias(dummy_file, __stderr_used); - -static void init_file_lock(FILE *f) -{ - if (f && f->lock<0) f->lock = 0; -} - -void *__copy_tls(unsigned char *); - int pthread_create(pthread_t *restrict res, const pthread_attr_t *restrict attrp, void *(*entry)(void *), void *restrict arg) { int ret; - size_t size, guard; + size_t size, guard = 0; struct pthread *self, *new; unsigned char *map = 0, *stack = 0, *tsd = 0, *stack_limit; unsigned flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND @@ -147,16 +53,7 @@ int pthread_create(pthread_t *restrict res, const pthread_attr_t *restrict attrp if (!libc.can_do_threads) return ENOSYS; self = __pthread_self(); - if (!libc.threaded) { - for (FILE *f=libc.ofl_head; f; f=f->next) - init_file_lock(f); - init_file_lock(__stdin_used); - init_file_lock(__stdout_used); - init_file_lock(__stderr_used); - __syscall(SYS_rt_sigprocmask, SIG_UNBLOCK, SIGPT_SET, 0, _NSIG/8); - self->tsd = (void **)__pthread_tsd_main; - libc.threaded = 1; - } + if (!libc.threaded) __thread_enable(); if (attrp) attr = *attrp; __acquire_ptc(); @@ -184,14 +81,14 @@ int pthread_create(pthread_t *restrict res, 
/* src/thread/pthread_exit.c — common exit path for POSIX pthread_exit
 * and (via __pthread_exit) C11 thrd_exit. */
#define _GNU_SOURCE
#include "pthread_impl.h"
#include "stdio_impl.h"
#include "libc.h"
/* NOTE(review): the next two include targets were garbled in this copy
 * of the patch; presumably <sys/mman.h> and <string.h> — confirm
 * against the upstream commit. */
#include <sys/mman.h>
#include <string.h>

/* Overridable no-op hooks: the real destructor runners are linked in
 * only when pthread_key_create.c / tss_create.c are used. */
static void dummy_0()
{
}
weak_alias(dummy_0, __pthread_tsd_run_dtors);
weak_alias(dummy_0, __tss_run_dtors);

_Noreturn void __pthread_exit(void *result)
{
	pthread_t self = __pthread_self();
	sigset_t set;

	self->result = result;

	/* Run pthread_cleanup_push handlers, newest first. */
	while (self->cancelbuf) {
		void (*f)(void *) = self->cancelbuf->__f;
		void *x = self->cancelbuf->__x;
		self->cancelbuf = self->cancelbuf->__next;
		f(x);
	}

	/* POSIX TSD destructors first, then C11 TSS destructors. */
	__pthread_tsd_run_dtors();

	__tss_run_dtors();

	/* Taken and never released: a concurrent thrd/pthread_detach that
	 * loses the race on this lock must join us instead. */
	__lock(self->exitlock);

	/* Mark this thread dead before decrementing count */
	__lock(self->killlock);
	self->dead = 1;

	/* Block all signals before decrementing the live thread count.
	 * This is important to ensure that dynamically allocated TLS
	 * is not under-allocated/over-committed, and possibly for other
	 * reasons as well. */
	__block_all_sigs(&set);

	/* Wait to unlock the kill lock, which governs functions like
	 * pthread_kill which target a thread id, until signals have
	 * been blocked. This precludes observation of the thread id
	 * as a live thread (with application code running in it) after
	 * the thread was reported dead by ESRCH being returned. */
	__unlock(self->killlock);

	/* It's impossible to determine whether this is "the last thread"
	 * until performing the atomic decrement, since multiple threads
	 * could exit at the same time. For the last thread, revert the
	 * decrement and unblock signals to give the atexit handlers and
	 * stdio cleanup code a consistent state. */
	if (a_fetch_add(&libc.threads_minus_1, -1)==0) {
		libc.threads_minus_1 = 0;
		__restore_sigs(&set);
		exit(0);
	}

	/* Drop this thread's contribution to the locale bookkeeping. */
	if (self->locale != &libc.global_locale) {
		a_dec(&libc.uselocale_cnt);
		if (self->locale->ctype_utf8)
			a_dec(&libc.bytelocale_cnt_minus_1);
	}

	if (self->detached && self->map_base) {
		/* Detached threads must avoid the kernel clear_child_tid
		 * feature, since the virtual address will have been
		 * unmapped and possibly already reused by a new mapping
		 * at the time the kernel would perform the write. In
		 * the case of threads that started out detached, the
		 * initial clone flags are correct, but if the thread was
		 * detached later (== 2), we need to clear it here. */
		if (self->detached == 2) __syscall(SYS_set_tid_address, 0);

		/* The following call unmaps the thread's stack mapping
		 * and then exits without touching the stack. */
		__unmapself(self->map_base, self->map_size);
	}

	for (;;) __syscall(SYS_exit, 0);
}
weak_alias(__pthread_exit, pthread_exit);

/* Push a cancellation-cleanup record onto the calling thread's list. */
void __do_cleanup_push(struct __ptcb *cb)
{
	if (!libc.has_thread_pointer) return;
	struct pthread *self = __pthread_self();
	cb->__next = self->cancelbuf;
	self->cancelbuf = cb;
}

/* Pop the most recently pushed cleanup record (does not run it). */
void __do_cleanup_pop(struct __ptcb *cb)
{
	if (!libc.has_thread_pointer) return;
	__pthread_self()->cancelbuf = cb->__next;
}

/* pthread_key_create.c overrides this */
static volatile size_t dummy = 0;
weak_alias(dummy, __pthread_tsd_size);
static void *dummy_tsd[1] = { 0 };
weak_alias(dummy_tsd, __pthread_tsd_main);

/* Weak stdio handles: stay null when stdio is not linked in. */
static FILE *volatile dummy_file = 0;
weak_alias(dummy_file, __stdin_used);
weak_alias(dummy_file, __stdout_used);
weak_alias(dummy_file, __stderr_used);

/* A negative lock value means "single-threaded, locking disabled". */
static void init_file_lock(FILE *f)
{
	if (f && f->lock<0) f->lock = 0;
}

/* One-time transition to multithreaded mode: activate stdio locking,
 * unblock the internal signals used by cancellation, and attach the
 * main thread's TSD array. Called by both pthread_create and
 * thrd_create before the first new thread is spawned. */
void __thread_enable(void)
{
	for (FILE *f=libc.ofl_head; f; f=f->next)
		init_file_lock(f);
	init_file_lock(__stdin_used);
	init_file_lock(__stdout_used);
	init_file_lock(__stderr_used);
	__syscall(SYS_rt_sigprocmask, SIG_UNBLOCK, SIGPT_SET, 0, _NSIG/8);
	__pthread_self()->tsd = (void **)__pthread_tsd_main;
	libc.threaded = 1;
}
b/src/thread/pthread_setcanceltype.c @@ -1,11 +1,13 @@ #include "pthread_impl.h" +void __pthread_testcancel(void); + int pthread_setcanceltype(int new, int *old) { struct pthread *self = __pthread_self(); if (new > 1U) return EINVAL; if (old) *old = self->cancelasync; self->cancelasync = new; - if (new) pthread_testcancel(); + if (new) __pthread_testcancel(); return 0; } diff --git a/src/thread/pthread_testcancel.c b/src/thread/pthread_testcancel.c index ba5f7c6..ee48e6d 100644 --- a/src/thread/pthread_testcancel.c +++ b/src/thread/pthread_testcancel.c @@ -7,7 +7,9 @@ static void dummy() weak_alias(dummy, __testcancel); -void pthread_testcancel() +void __pthread_testcancel() { __testcancel(); } + +weak_alias(__pthread_testcancel, pthread_testcancel); diff --git a/src/thread/thrd_create.c b/src/thread/thrd_create.c new file mode 100644 index 0000000..f72f992 --- /dev/null +++ b/src/thread/thrd_create.c @@ -0,0 +1,98 @@ +#define _GNU_SOURCE +#include "pthread_impl.h" +#include "stdio_impl.h" +#include "libc.h" +#include +#include + +void *__mmap(void *, size_t, int, int, int, off_t); +int __munmap(void *, size_t); +int __mprotect(void *, size_t, int); +void __thread_enable(void); +_Noreturn void __pthread_exit(void *); +void *__copy_tls(unsigned char *); +extern volatile size_t __pthread_tsd_size; + +_Noreturn void thrd_exit(int result) { + __pthread_exit((void*)(intptr_t)result); +} + +static void dummy_0() +{ +} +weak_alias(dummy_0, __acquire_ptc); +weak_alias(dummy_0, __release_ptc); + +static int start(void *p) +{ + thrd_t self = p; + int (*start)(void*) = (int(*)(void*)) self->start; + thrd_exit(start(self->start_arg)); + return 0; +} + +#define ROUND(x) (((x)+PAGE_SIZE-1)&-PAGE_SIZE) + +int thrd_create(thrd_t *res, thrd_start_t entry, void *arg) +{ + int ret = -ENOMEM; + size_t guard = ROUND(DEFAULT_GUARD_SIZE); + size_t size = guard + ROUND(DEFAULT_STACK_SIZE + libc.tls_size + __pthread_tsd_size); + struct pthread *self, *new; + unsigned char *map, *stack, 
*tsd, *stack_limit; + unsigned flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND + | CLONE_THREAD | CLONE_SYSVSEM | CLONE_SETTLS + | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | CLONE_DETACHED; + + if (!libc.can_do_threads) return thrd_error; + self = __pthread_self(); + if (!libc.threaded) __thread_enable(); + + __acquire_ptc(); + + if (guard) { + map = __mmap(0, size, PROT_NONE, MAP_PRIVATE|MAP_ANON, -1, 0); + if (map == MAP_FAILED) goto CLEANUP; + if (__mprotect(map+guard, size-guard, PROT_READ|PROT_WRITE)) { + __munmap(map, size); + goto CLEANUP; + } + } else { + map = __mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); + if (map == MAP_FAILED) goto CLEANUP; + } + tsd = map + size - __pthread_tsd_size; + stack = tsd - libc.tls_size; + stack_limit = map + guard; + + new = __copy_tls(tsd - libc.tls_size); + new->map_base = map; + new->map_size = size; + new->stack = stack; + new->stack_size = stack - stack_limit; + new->start = (void *(*)(void*))entry; + new->start_arg = arg; + new->self = new; + new->tsd = (void *)tsd; + new->locale = &libc.global_locale; + new->unblock_cancel = self->cancel; + new->canary = self->canary; + + a_inc(&libc.threads_minus_1); + ret = __clone(start, stack, flags, new, &new->tid, TP_ADJ(new), &new->tid); + + CLEANUP: + __release_ptc(); + + if (ret > 0) { + *res = new; + ret = thrd_success; + } else if (ret == -ENOMEM) { + ret = thrd_nomem; + } else { + a_dec(&libc.threads_minus_1); + if (map) __munmap(map, size); + ret = thrd_error; + } + return ret; +} diff --git a/src/thread/thrd_current.c b/src/thread/thrd_current.c new file mode 100644 index 0000000..1728535 --- /dev/null +++ b/src/thread/thrd_current.c @@ -0,0 +1,11 @@ +#include "pthread_impl.h" +#include + +/* Not all arch have __pthread_self as a symbol. On some this results + in some magic. So this "call" to __pthread_self is not necessary a + tail call. In particular, other than the appearance, thrd_current + can not be implemented as a weak symbol. 
/* src/thread/thrd_current.c
   Not all archs have __pthread_self as a real symbol; on some it
   expands to inline magic. So this "call" to __pthread_self is not
   necessarily a tail call, and thrd_current can not be implemented
   as a weak alias of pthread_self. */
#include "pthread_impl.h"
#include <threads.h>

pthread_t thrd_current()
{
	return __pthread_self();
}

/* src/thread/thrd_detach.c */
#include <sys/mman.h>

int thrd_detach(thrd_t t)
{
	/* Cannot detach a thread that's already exiting: if we lose the
	 * race for the exitlock (the exit path takes it permanently),
	 * behave as an immediate join and reclaim the mapping here. */
	if (a_swap(t->exitlock, 1)){
		int tmp;
		/* tid is cleared by the kernel (CHILD_CLEARTID) once the
		 * thread is fully gone; futex-wait on it until then. */
		while ((tmp = t->tid)) __timedwait(&t->tid, tmp, 0, 0, 0, 0, 0);
		if (t->map_base) munmap(t->map_base, t->map_size);
	} else {
		/* detached == 2 means "detached after creation"; the exit
		 * path uses it to disable the kernel tid-clear write before
		 * unmapping its own stack. */
		t->detached = 2;
		__unlock(t->exitlock);
	}
	return thrd_success;
}

/* src/thread/thrd_equal.c */

/* Parenthesized name keeps this definition valid even if <threads.h>
 * also provides a thrd_equal macro. */
int (thrd_equal)(thrd_t a, thrd_t b)
{
	return a==b;
}
/* C11 threads cannot be canceled, so unlike pthread_join there is no
   need for a cancellation cleanup handler here. tid is cleared by the
   kernel on exit (CHILD_CLEARTID); futex-wait on it until then. */
int thrd_join(thrd_t t, int *res)
{
	int tmp;
	while ((tmp = t->tid)) __timedwait(&t->tid, tmp, 0, 0, 0, 0, 0);
	if (res) *res = (int)(intptr_t)t->result;
	if (t->map_base) munmap(t->map_base, t->map_size);
	return thrd_success;
}

/* src/thread/tss_create.c */
#include "pthread_impl.h"
#include <threads.h>

/* future versions of C might prescribe a minimum number of tss */
#define __TSS_MAX_MIN 64

#ifndef TSS_MAX
# define TSS_MAX __TSS_MAX_MIN
#endif

/* Lowest set bit of X; equals X exactly when X is a power of two. */
#define __LOW_BIT(X) ((X+0U) & -(X+0U))

#define STRINGIFY_(X) #X
#define STRINGIFY(X) STRINGIFY_(X)
#define WARN__(X) _Pragma(#X)
#define WARN_(X) WARN__(message (X))
#define WARN(X) WARN_(X)

#if (TSS_MAX) < __TSS_MAX_MIN
WARN("TSS_MAX " STRINGIFY(TSS_MAX) " replaced by minimal value " STRINGIFY(__TSS_MAX_MIN))
# undef TSS_MAX
# define TSS_MAX __TSS_MAX_MIN
#endif

#if (TSS_MAX) != __LOW_BIT(TSS_MAX)
# error TSS_MAX must be a power of 2
#endif

enum { tss_max = (TSS_MAX), };

/* Slot j is allocated iff dtors[j] != 0; shared across all threads. */
static tss_dtor_t dtors[tss_max];

/* Placeholder destructor so "allocated without dtor" stays
 * distinguishable from "free slot". */
static void nodtor(void *dummy)
{
}

/* This should use _Thread_local which must exist for C11 threads.
   Since musl might get compiled with pre-C11 options we use gcc's
   __thread extension, instead. */
static __thread void * data[tss_max];
/* Thread local data is accounted for with a usage counter */
static __thread tss_t used;

int tss_create(tss_t *tss, tss_dtor_t dtor)
{
	/* Pseudo-random probe start derived from the caller's stack
	 * address, spreading keys across the table. */
	unsigned start = (uintptr_t)&tss / 16 % tss_max;
	if (!dtor) dtor = nodtor;

	for (unsigned i = 0; i < tss_max; ++i) {
		unsigned j = (start+i)%tss_max;
		/* a_cas_p returns the previous value: 0 means this slot
		 * was free and is now claimed atomically. */
		if (!a_cas_p(dtors+j, 0, (void *)dtor)) {
			*tss = j;
			return thrd_success;
		}
	}
	/* Table full; C11 leaves *tss unspecified on failure. */
	*tss = -1;
	return thrd_error;
}
*/ + if (k < tss_max) dtors[k] = 0; +} + +int tss_set(tss_t k, void *x) +{ + if (k >= tss_max || !dtors[k]) return thrd_error; + if (data[k] != x) { + /* tss_set can be used to add or remove a tss. compute + the difference in usage. */ + used += (!!x - !!data[k]); + data[k] = x; + } + return thrd_success; +} + +void * tss_get(tss_t k) { + return dtors[k] ? data[k] : 0; +} + + +void __tss_run_dtors(void) +{ + for (unsigned j=0; used && j +#include +#include "syscall.h" +#include "libc.h" +#include "threads.h" + +/* Roughly __syscall already returns the right thing: 0 if all went + well or a negative error indication, otherwise. */ +int thrd_sleep(const struct timespec *req, struct timespec *rem) +{ + int ret = __syscall(SYS_nanosleep, req, rem); + // compile time comparison, constant propagated + if (EINTR != 1) { + switch (ret) { + case 0: return 0; + /* error described by POSIX: */ + /* EINTR The nanosleep() function was interrupted by a signal. */ + /* The C11 wording is: */ + /* -1 if it has been interrupted by a signal */ + case -EINTR: return -1; + /* error described by POSIX */ + case -EINVAL: return -2; + /* described for linux */ + case -EFAULT: return -3; + /* No other error returns are specified. */ + default: return INT_MIN; + } + } + // potentially a tail call + return ret; +}