/*
 * Copyright (C) 2007-2009 FAUmachine Team <info@faumachine.org>.
 * This program is free software. You can redistribute it and/or modify it
 * under the terms of the GNU General Public License, either version 2 of
 * the License, or (at your option) any later version. See COPYING.
 */

#include <assert.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

#include "glue-main.h"
#include "simsetup.h"

/* Number of scheduler threads; assigned in sched_go(). */
static int nthreads;
/* Thread-specific key: the scheduler context of the calling thread. */
static pthread_key_t scheduler_key;
/* Thread-specific key: the process currently assigned to the calling thread. */
static pthread_key_t current_key;
/* Backing storage for the two process lists (1024 slots each). */
static struct process *proc_list[2][1024];
/* List (and fill level) of processes still to be handled in this round. */
static struct process **todo = &proc_list[0][0];
static volatile int todo_count;
/*
 * List (and fill level) of processes collected for the next round.
 * sched_next() swaps it with the todo list once todo has been drained.
 */
static struct process **done = &proc_list[1][0];
static volatile int done_count;
/* Set to 1 by sim_exit(); checked by sched_next() after each round. */
static volatile int end;
/* Threads that have not yet found the todo list empty in this round. */
static volatile unsigned int running;

/*
 * Take one process off the end of the todo list without locking.
 *
 * The fill level is claimed with a compare-and-swap; if another thread
 * changed todo_count in the meantime the attempt is simply repeated.
 *
 * Returns the dequeued process, or NULL if the todo list is empty.
 */
static struct process *
sched_todo_dequeue(void)
{
	int seen;

	do {
		seen = todo_count;
		if (seen == 0) {
			/* Nothing left in this round. */
			return NULL;
		}
	} while (! __sync_bool_compare_and_swap(&todo_count, seen, seen - 1));

	/* Slot seen - 1 is ours now. */
	return todo[seen - 1];
}

/*
 * Append process p to the done list.
 *
 * The slot index is reserved atomically, so several threads may enqueue
 * concurrently.
 *
 * NOTE(review): the reserved index is not checked against the size of
 * the backing array -- confirm that callers can never exceed it.
 */
static void
sched_done_enqueue(struct process *p)
{
	int slot;

	slot = __sync_fetch_and_add(&done_count, 1);
	done[slot] = p;
}

/*
 * Prepare process p so that it starts executing f(s) the first time the
 * scheduler switches to it.
 *
 * An initial frame is built at the top of p->stack in exactly the layout
 * that RESTORE_REGS pops: zeroed general purpose registers followed by
 * the address of f as the "return" target. On i386 the argument s is
 * placed on the stack above a zero return address; on x86_64 it is stored
 * in the slot that is popped into rdi.
 *
 * p->tos receives the aligned top of stack, p->sp the initial saved stack
 * pointer. The instruction counters are cleared, the state is set to 1
 * and the process is appended to the done list.
 */
void
sched_process_init(struct process *p, void (*f)(void *), void *s)
{
	p->f = f;
#if defined(__i386__)
	uint32_t *sp;
	uint8_t *sp8;

	/* Start one past the end of the process' stack area. */
	sp8 = (uint8_t *) &p->stack[sizeof(p->stack)];

	/*
	 * Stack Alignment: zero-fill downwards until the address is
	 * 4 (mod 16), so the argument slot pushed next is 16-byte aligned.
	 */
	while(((unsigned long int)sp8 & 15) != 4) {
		*--sp8 = 0;
	}
	sp = (uint32_t *) sp8;
	p->tos = sp;

	*--sp = (uint32_t) s; /* Argument of f */
	*--sp = 0; /* Return Address */

	/* Frame popped by RESTORE_REGS (in reverse order). */
	*--sp = (uint32_t) f; /* eip */
	*--sp = 0; /* ebx */
	*--sp = 0; /* ecx */
	*--sp = 0; /* edi */
	*--sp = 0; /* esi */
	*--sp = 0; /* ebp */

	p->sp = (void *) sp;

#elif defined(__x86_64__)
	uint64_t *sp;
	uint8_t  *sp8;

	/* Start one past the end of the process' stack area. */
	sp8 = (uint8_t *) &p->stack[sizeof(p->stack)];

	/* Stack Alignment: zero-fill downwards to a 16-byte boundary. */
	while(((unsigned long long int)sp8 & 15) != 0) {
		*--sp8 = 0;
	}
	sp = (uint64_t *) sp8;
	p->tos = sp;

	*--sp = 0; /* Return Address */

	/* Frame popped by RESTORE_REGS (in reverse order). */
	*--sp = (uint64_t) f; /* rip */
	*--sp = 0; /* rax */
	*--sp = 0; /* rbx */
	*--sp = 0; /* rcx */
	*--sp = 0; /* rdx */
	*--sp = (uint64_t) s; /* rdi: argument of f */
	*--sp = 0; /* rsi */
	*--sp = 0; /* rbp */
	*--sp = 0; /* r8 */
	*--sp = 0; /* r9 */
	*--sp = 0; /* r10 */
	*--sp = 0; /* r11 */
	*--sp = 0; /* r12 */
	*--sp = 0; /* r13 */
	*--sp = 0; /* r14 */
	*--sp = 0; /* r15 */

	p->sp = (void *) sp;
#else
#error Unknown CPU
#endif

	p->inst_cnt = 0;
	p->inst_limit = 0;
	p->state = 1; /* Running */

	/* Make the new process known to the scheduler. */
	sched_done_enqueue(p);
}

/*static*/ void *
sched_sys(void *sp)
{
	struct process *current;
	struct process *scheduler;

	current = pthread_getspecific(current_key);
	scheduler = pthread_getspecific(scheduler_key);

	current->sp = sp;
	return scheduler->sp;
}

/*static*/ void *
sched_usr(void)
{
	struct process *current;

	current = pthread_getspecific(current_key);

	return current->sp;
}

/*
 * Select the next process for the calling thread.
 *
 * Processes are taken from the todo list. A process whose inst_limit is
 * not above its inst_cnt is moved to the done list and skipped.
 *
 * When the todo list is empty, the threads meet at a barrier built from
 * the `running` counter and the `round` number: every thread decrements
 * `running`; the thread that brings it to zero swaps the todo and done
 * lists, calls time_advance() on the new todo list, resets `running` and
 * increments `round`. All other threads yield until `round` has changed.
 *
 * Returns the selected process, or the calling thread's scheduler context
 * if `end` is found set after the barrier.
 */
static struct process *
sched_next(void)
{
	static volatile unsigned int round = 0;
	unsigned int myround;
	struct process *p;
	struct process **tmp;

again:	;
	p = sched_todo_dequeue();

	if (! p) {
		/*
		 * Sample the round to wait for before giving up our
		 * "running" slot (presumably so the increment done by the
		 * last thread cannot be missed).
		 */
		myround = round + 1;

		if (__sync_sub_and_fetch(&running, 1) == 0) {
			/* Last thread: start the next round. */
			tmp = done;
			done = todo;
			todo = tmp;
			todo_count = done_count;
			done_count = 0;

			time_advance(todo, todo_count);

			/* Re-arm the barrier, then release the waiters. */
			running = nthreads;
			round++;

		} else {
			/* Wait until the last thread has started the next round. */
			while (myround != round) {
#if 1
				sched_yield();
#else
				asm("pause\n");
#endif
			}
		}

		if (end) {
			/* Simulation is ending: switch back to the scheduler context. */
			return pthread_getspecific(scheduler_key);
		}

		goto again;
	}

	if (p->inst_limit <= p->inst_cnt) {
		/* Not eligible to run now: keep it for the next round. */
		sched_done_enqueue(p);
		goto again;
	}

	return p;
}

/*static*/ void
sched(int slp)
{
	struct process *current;

	current = pthread_getspecific(current_key);

	if (slp) {
		if (0 < __sync_sub_and_fetch(&current->state, 1)) {
			/* Process still in "run" state. */
			sched_done_enqueue(current);
		}
	} else {
		/* Add to "done" queue. */
		sched_done_enqueue(current);
	}

	current = sched_next();

	pthread_setspecific(current_key, current);
}

/*
 * Names of the C helpers as written into the assembler stubs below.
 * Builds with DARWIN defined use the names with a leading underscore.
 */
#if defined(DARWIN)
#define SCHED_SYS	"_sched_sys"
#define SCHED_USR	"_sched_usr"
#define SCHED		"_sched"
#else
#define SCHED_SYS	"sched_sys"
#define SCHED_USR	"sched_usr"
#define SCHED		"sched"
#endif

/*
 * SAVE_REGS: push the registers of the running context onto its own
 * stack, hand the resulting stack pointer to sched_sys() and continue on
 * the stack pointer that sched_sys() returns.
 *
 * RESTORE_REGS: fetch a stack pointer from sched_usr(), switch to it, pop
 * the registers in reverse order and return to the address stored above
 * them.
 *
 * The register order must match the initial frame built in
 * sched_process_init().
 */
#if defined(__i386__)
/* i386: the stack pointer is passed to sched_sys() on the stack. */
#define SAVE_REGS \
"	pushl %ebx\n" \
"	pushl %ecx\n" \
"	pushl %edi\n" \
"	pushl %esi\n" \
"	pushl %ebp\n" \
"	pushl %esp\n" \
"	call " SCHED_SYS "\n" \
"	movl %eax, %esp\n"

/* i386: load the registers of the selected context and return into it. */
#define RESTORE_REGS \
"	call " SCHED_USR "\n" \
"	movl %eax, %esp\n" \
"	popl %ebp\n" \
"	popl %esi\n" \
"	popl %edi\n" \
"	popl %ecx\n" \
"	popl %ebx\n" \
"	ret\n"

#elif defined(__x86_64__)
/* x86_64: the stack pointer is passed to sched_sys() in %rdi. */
#define SAVE_REGS \
"	pushq %rax\n" \
"	pushq %rbx\n" \
"	pushq %rcx\n" \
"	pushq %rdx\n" \
"	pushq %rdi\n" \
"	pushq %rsi\n" \
"	pushq %rbp\n" \
"	pushq %r8\n" \
"	pushq %r9\n" \
"	pushq %r10\n" \
"	pushq %r11\n" \
"	pushq %r12\n" \
"	pushq %r13\n" \
"	pushq %r14\n" \
"	pushq %r15\n" \
"	movq %rsp, %rdi\n" \
"	call " SCHED_SYS "\n" \
"	movq %rax, %rsp\n"

/* x86_64: load the registers of the selected context and return into it. */
#define RESTORE_REGS \
"	call " SCHED_USR "\n" \
"	movq %rax, %rsp\n" \
"	popq %r15\n" \
"	popq %r14\n" \
"	popq %r13\n" \
"	popq %r12\n" \
"	popq %r11\n" \
"	popq %r10\n" \
"	popq %r9\n" \
"	popq %r8\n" \
"	popq %rbp\n" \
"	popq %rsi\n" \
"	popq %rdi\n" \
"	popq %rdx\n" \
"	popq %rcx\n" \
"	popq %rbx\n" \
"	popq %rax\n" \
"	retq\n"

#else
#error Unknown CPU
#endif

/*
 * void sched_to_scheduler(void);
 *
 * Give up the CPU without going to sleep: the caller's registers are
 * saved, sched(0) puts the caller on the done list and selects another
 * context, and that context is resumed.
 */
asm (
"	.text\n"
"	.p2align 4\n"
"sched_to_scheduler: .global sched_to_scheduler\n"
	/* Save the caller's registers and switch to the scheduler's stack. */
	SAVE_REGS

#if defined(__i386__)
"	pushl $0\n" /* Schedule: slp = 0, passed on the stack */
"	call " SCHED "\n"
"	addl $4, %esp\n" /* Remove the argument again */
#elif defined(__x86_64__)
"	movq $0, %rdi\n" /* Schedule: slp = 0, passed in %rdi */
"	call " SCHED "\n"
#else
#error Unknown CPU
#endif

	/* Continue with the context selected by sched(). */
	RESTORE_REGS
);

/*
 * void sched_sleep(void);
 *
 * Put the caller to sleep: the caller's registers are saved and sched(1)
 * decrements the caller's state counter, re-queueing the caller only if
 * the counter is still positive. The context selected by sched() is then
 * resumed.
 */
asm (
"	.text\n"
"	.p2align 4\n"
"sched_sleep: .global sched_sleep\n"
	/* Save the caller's registers and switch to the scheduler's stack. */
	SAVE_REGS

#if defined(__i386__)
"	pushl $1\n" /* Sleep: slp = 1, passed on the stack */
"	call " SCHED "\n"
"	addl $4, %esp\n" /* Remove the argument again */
#elif defined(__x86_64__)
"	movq $1, %rdi\n" /* Sleep: slp = 1, passed in %rdi */
"	call " SCHED "\n"
#else
#error Unknown CPU
#endif

	/* Continue with the context selected by sched(). */
	RESTORE_REGS
);

/*
 * Wake up process p.
 *
 * The state counter of p is incremented atomically. Only if it was 0
 * before the increment is p put on the done list; otherwise nothing
 * else happens.
 */
void
sched_wakeup(struct process *p)
{
	if (__sync_fetch_and_add(&p->state, 1) != 0) {
		/* Counter was already non-zero: do not enqueue. */
		return;
	}

	/* Add process to run queue. */
	sched_done_enqueue(p);
}

/*
 * Request the end of the simulation by setting the `end` flag.
 * sched_next() checks the flag after each round and then returns the
 * scheduler context of the calling thread.
 */
void
sim_exit(void)
{
	end = 1;
}

static void *
_sched_go(void *dummy)
{
	struct process myself;

	myself.state = 1;

	pthread_setspecific(scheduler_key, &myself);
	pthread_setspecific(current_key, &myself);

	sched_sleep();

	return NULL;
}

/*
 * Run the scheduler until the simulation ends.
 *
 * The number of threads is taken from simsetup.multithreaded; if that is
 * not positive, the number of online processors is used instead. The
 * calling thread acts as one of the scheduler threads, so only
 * nthreads - 1 additional threads are created. The function returns
 * after the calling thread has left _sched_go() and all additional
 * threads have been joined.
 *
 * Failures of the pthread calls or of the allocation are caught by
 * assert().
 */
void
sched_go(void)
{
	pthread_t *thread;
	int i;
	int ret;

	nthreads = simsetup.multithreaded;
	if (nthreads <= 0) {
		nthreads = sysconf(_SC_NPROCESSORS_ONLN);
	}
	assert(1 <= nthreads);

	fprintf(stderr, "Using %d %s.\n",
			nthreads, nthreads == 1 ? "thread" : "threads");

	ret = pthread_key_create(&scheduler_key, NULL);
	assert(ret == 0);
	ret = pthread_key_create(&current_key, NULL);
	assert(ret == 0);

	thread = malloc(nthreads * sizeof(*thread));
	assert(thread);

	end = 0;
	running = nthreads;
	/* Start the additional threads ... */
	for (i = 0; i < nthreads - 1; i++) {
		ret = pthread_create(&thread[i], NULL, _sched_go, NULL);
		assert(ret == 0);
	}
	/* ... and take part in scheduling ourselves. */
	_sched_go(NULL);
	for (i = 0; i < nthreads - 1; i++) {
		ret = pthread_join(thread[i], NULL);
		assert(ret == 0);
	}

	/* All threads are joined; the handle array is no longer needed. */
	free(thread);

	ret = pthread_key_delete(current_key);
	assert(ret == 0);
	ret = pthread_key_delete(scheduler_key);
	assert(ret == 0);
}

/*
 * Initialize the scheduler: both the todo and the done list start out
 * empty.
 */
void
sched_create(void)
{
	todo_count = 0;
	done_count = 0;
}

/*
 * Counterpart of sched_create(). Currently there is nothing to release.
 */
void
sched_destroy(void)
{
	/* nothing to do */
}

/*
 * Suspend hook of the scheduler (the spelling of the name is part of the
 * existing interface).
 *
 * Currently a stub: nothing is written to fSched. A former, disabled
 * implementation wrote the variables proc_first and proc_last, which are
 * not defined in this file.
 */
void
sched_do_susped(FILE *fSched)
{
	/* No scheduler state is saved. */
	(void) fSched;
}

/*
 * Resume hook of the scheduler.
 *
 * Currently a stub: nothing is read from fSched. A former, disabled
 * implementation read the variables proc_first and proc_last, which are
 * not defined in this file.
 *
 * Always returns 0.
 */
int
sched_do_resume(FILE *fSched)
{
	/* No scheduler state is restored. */
	(void) fSched;

	return 0;
}

/*
 * Adapter with a void * parameter around sched_wakeup(), used as the
 * callback passed to time_call_at() in sched_delay().
 * _p must point to the process to wake up.
 */
static void
_sched_wakeup(void *_p)
{
	sched_wakeup((struct process *) _p);
}


void
sched_delay(unsigned long long delay)
{
	struct process *current;

	current = pthread_getspecific(current_key);
	time_call_at(time_virt() + delay, _sched_wakeup, current);
	
	sched_sleep();
}

void
sched_abort_processes(void)
{
	struct process *current;
	int i;

	current = pthread_getspecific(current_key);
	current->inst_cnt = current->inst_limit;

	for (i = 0; i < todo_count; i++) {
		struct process *p;

		p = todo[i];
		p->inst_cnt = p->inst_limit;
	}

	for (i = 0; i < done_count; i++) {
		struct process *p;

		p = done[i];
		p->inst_cnt = p->inst_limit;
	}
}
