#include <linux/init.h>
#include <linux/irqflags.h>
#include <linux/module.h>

/* License string recorded in the module's metadata. */
MODULE_LICENSE("Dual BSD/GPL");

/*
 * GET_TICKS(t) - read the x86 time-stamp counter into t.
 *
 * Executes, in order: xorl (zeroes EAX, i.e. selects CPUID leaf 0), CPUID,
 * RDTSC.  The "=a" output constraint binds t to EAX, so t receives only the
 * LOW 32 bits of the counter; the high half that RDTSC leaves in EDX is
 * listed as a clobber and discarded.  Differences between two readings are
 * therefore only meaningful modulo 2^32 cycles.
 *
 * CPUID is issued immediately before RDTSC, presumably as a serializing
 * barrier so that earlier instructions have completed before the counter
 * is sampled.
 *
 * Clobbers: EBX, ECX, EDX (written by CPUID / RDTSC).
 * t must be an lvalue that can live in EAX; the callers in this file pass
 * an unsigned int.
 */
#define GET_TICKS(t)							\
    __asm__ __volatile__ ("xorl %%eax, %%eax\n"				\
			  "cpuid\n"					\
			  "rdtsc\n"					\
			  : "=a" (t) : : "%ebx", "%ecx", "%edx")

/*
 * GET_LATENCY(t) - measure the cost of one timestamp read, in TSC ticks.
 *
 * Performs the same xorl/CPUID/RDTSC sequence as GET_TICKS twice in a row:
 *   1. first reading  -> low 32 bits saved in ESI
 *   2. second reading -> low 32 bits left in EAX
 *   3. EAX -= ESI
 * The "=a" output constraint then stores the difference in t.  The value is
 * the tick count between the two RDTSC instructions, i.e. the overhead of
 * the movl + xorl + CPUID + RDTSC in between, computed modulo 2^32.
 *
 * Clobbers: EBX, ECX, EDX (CPUID / RDTSC) and ESI (scratch for the first
 * reading).
 * t must be an lvalue that can live in EAX; the caller in this file passes
 * an unsigned int.
 */
#define GET_LATENCY(t)							\
    __asm__ __volatile__ ("xorl %%eax, %%eax\n"				\
			  "cpuid\n"					\
			  "rdtsc\n"					\
			  "movl %%eax, %%esi\n"				\
			  "xorl %%eax, %%eax\n"				\
			  "cpuid\n"					\
			  "rdtsc\n"					\
			  "sub %%esi, %%eax\n"				\
			  : "=a" (t) : : "%ebx", "%ecx", "%edx", "%esi")

/*
 * Multiplier read inside the benchmark loop.  It has static storage and no
 * initializer, so it starts at 0, and nothing in the visible code writes to
 * it.  The volatile qualifier forces a real memory load on every read, so
 * the compiler cannot fold the loop body to a constant.
 */
static volatile int dummy;

/* Iteration counts of the synthetic workload (outer x inner loop). */
enum {
    LOAD_OUTER_ITERS = 1000,
    LOAD_INNER_ITERS = 1000000
};

/*
 * One pass of the synthetic workload: accumulate j * dummy over
 * LOAD_OUTER_ITERS x LOAD_INNER_ITERS iterations.
 */
static void load_workload(void)
{
    int outer, inner;
    int sum = 0;

    for (outer = 0; outer < LOAD_OUTER_ITERS; outer++) {
        for (inner = 0; inner < LOAD_INNER_ITERS; inner++)
            sum += inner * dummy;
    }

    // Empty asm that takes sum as an input: it emits no instructions but
    // makes the accumulated value "used", so the optimiser cannot discard
    // the multiply/add and reduce the loop to bare volatile loads.
    __asm__ __volatile__ ("" : : "r" (sum));
}

/*
 * load - time one pass of the synthetic workload and log the result.
 *
 * Sequence:
 *   1. issue GET_TICKS once as a warm-up and measure the timestamp-read
 *      overhead with GET_LATENCY;
 *   2. run the workload once untimed to warm it up;
 *   3. run it again between two GET_TICKS readings;
 *   4. printk the elapsed ticks and the measured read overhead.
 *
 * The overhead is reported alongside the load time, not subtracted from it.
 *
 * NOTE: GET_TICKS captures only the low 32 bits of the TSC, so the printed
 * load time is correct only while the timed pass takes fewer than 2^32
 * ticks; a longer run wraps silently.
 */
void load(void)
{
    unsigned int t1, t2, latency;

    // cpuid needs to be warmed up, so we warm it here; this t1 value is
    // deliberately thrown away.
    GET_TICKS(t1);
    GET_LATENCY(latency);

    // Now warm up the actual code (for the pipeline)
    load_workload();

    // Run the benchmark
    GET_TICKS(t1);
    load_workload();
    GET_TICKS(t2);

    // Unsigned subtraction gives the right delta even if the 32-bit
    // counter value wrapped once between the two readings.
    printk(KERN_ALERT "Load time = %u, latency = %u\n", t2-t1, latency);
}

static int tracer_init()
{
    spinlock_t lock = SPIN_LOCK_UNLOCKED;
    unsigned long flags;

    spin_lock_irqsave(&lock, flags);
    load();
    spin_unlock_irqrestore(&lock, flags);
    
    return 0;
}

static void tracer_exit()
{
    printk(KERN_ALERT "Bye\n");
}

/* Register the load-time and unload-time entry points of this module. */
module_init(tracer_init);
module_exit(tracer_exit);
