microsecond-accurate timing on Windows

Discussion of chess software programming and technical issues.

Moderators: hgm, Rebel, chrisw

mar
Posts: 2554
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

microsecond-accurate timing on Windows

Post by mar »

I have been thinking recently about time measurement on Windows:

GetTickCount: has a granularity of ~15 msec, which is bad.
timeGetTime: depends on the last period set with timeBeginPeriod. The docs say the period is system-global, so if another app calls timeBeginPeriod(40) while your program is running, you immediately get worse granularity than GetTickCount. It's also said to affect thread scheduling, so better to keep hands off it.
QueryPerformanceCounter/QueryPerformanceFrequency: ultra-high resolution; if I remember correctly these calls used to take longer than GetTickCount. Not always available.
rdtsc instruction: very fast, but there may be problems with multiple cores and dynamic changes in CPU frequency, and it's nonportable.

Are there better alternatives I missed?

Now, what if I want to measure in microseconds instead of the raw counter units (with a frequency usually close to the CPU clock) that QPC/QPF provide?
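For reference, the naive one-shot conversion is overflow-prone: the multiply by 1000000 can exceed the signed 64-bit range once the raw count gets large. A minimal sketch of that naive approach (an illustration, not the solution below):

Code: Select all

// naive sketch (illustration only): one-shot QPC -> microseconds.
// counter * 1000000 overflows a signed 64-bit value once the raw
// count exceeds roughly 9.2e12 ticks, which is why the code below
// works with small deltas and a remainder instead.
#include <windows.h>

__int64 naiveMicrosec()
{
	LARGE_INTEGER c, f;
	QueryPerformanceCounter( &c );
	QueryPerformanceFrequency( &f );
	return c.QuadPart * 1000000 / f.QuadPart;	// beware the overflow
}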
Here's my solution using QPC/QPF. Note that it's fairly slow on the 32-bit machine here: getMicrosec() itself takes about 1-2 microseconds, which is already a lot. Anyway, here's the implementation in case it's useful to someone (I left out some implementation details, but I guess it's self-explanatory):

Code: Select all

static volatile signed char init = 0;
static i64 freq;
static u32 shortFreq;
static i64 lastTick;
static i64 remainder = 0;
static i64 emul = 0;
static Mutex usMutex;
static Mutex gMutex;
static i32 lastTC;

i64 getMicrosec()
{
	if ( init == -1 )
	{
		LARGE_INTEGER tmp;
		QueryPerformanceCounter( &tmp );
		i64 cur = (i64)tmp.QuadPart;
		{
			MutexLock _( usMutex );
			i64 delta = cur - lastTick + remainder;
			i64 us = delta * 1000000 / shortFreq;
			emul += us;
			remainder = delta - us * freq / 1000000;
			lastTick = cur;
			return emul;
		}
	}
	if ( init == 1 )
	{
		LARGE_INTEGER tmp;
		QueryPerformanceCounter( &tmp );
		i64 cur = (i64)tmp.QuadPart;
		{
			MutexLock _( usMutex );
			i64 delta = cur - lastTick + remainder;
			i64 us = delta * 1000000 / freq;
			emul += us;
			remainder = delta - us * freq / 1000000;
			lastTick = cur;
			return emul;
		}
	}
	if ( init == 2 )
	{
		MutexLock _( usMutex );
		i32 cur = (i32)GetTickCount();
		i32 delta = cur - lastTC;
		i64 tmp = (i64)delta * 1000;
		lastTC = cur;
		return emul += tmp;
	}
	
	// initialize

	MutexLock _( gMutex );

	LARGE_INTEGER frq;
	if ( QueryPerformanceFrequency( &frq ) == FALSE )
	{
		// not available => use millisec emulation
		init = 2;
		lastTC = GetTickCount();
	} else {
		freq = (i64)frq.QuadPart;
		LARGE_INTEGER tmp;
		QueryPerformanceCounter( &tmp );
		lastTick = (i64)tmp.QuadPart;

		if ( freq <= 0xffffffffU )
		{
			shortFreq = (u32)freq;
			init = -1;
		}
		else
		{
			init = 1;
		}
	}
	return getMicrosec();
}

i32 getMillisec()
{
	return (i32)(getMicrosec()/1000 & 0xffffffffU);
}
Basically this behaves like GetTickCount, except that it has microsecond resolution and a 64-bit output counter. getMillisec() is a 1-millisecond-accurate version.
Of course, if the performance counter is not available, the code falls back to emulation using GetTickCount().
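A minimal usage sketch (a fragment; doSomeWork() is a placeholder and <stdio.h> is assumed, neither is part of the code above):

Code: Select all

// hypothetical usage: time a piece of work with getMicrosec()
i64 start = getMicrosec();
doSomeWork();					// placeholder for the code being timed
i64 elapsed = getMicrosec() - start;
printf( "took %lld us\n", (long long)elapsed );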
mar
Posts: 2554
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

Re: microsecond-accurate timing on Windows

Post by mar »

Final version: enhanced getMicrosec() stability, wraps tested (both pos=>neg and neg=>pos in two's complement).
Fixed getMillisec(), wraps tested.

Code: Select all

static volatile signed char init = 0;
static u64 freq;
static u32 shortFreq;
static u64 lastTick;
static u64 remainder = 0;
static u64 emul = 0;	//-1000000 + 0*0x7fffffffffffffffLL;
static Mutex usMutex;
static Mutex gMutex;
static u32 lastTC;

i64 getMicrosec()
{
	if ( init == -1 )
	{
		LARGE_INTEGER tmp;
		QueryPerformanceCounter( &tmp );
		u64 cur = (u64)tmp.QuadPart;
		{
			MutexLock _( usMutex );
			u64 delta = cur - lastTick + remainder;
			if ( delta >= shortFreq )				// enhance stability if interval is longer than one sec
			{
				emul += (delta/shortFreq) * 1000000U;
				delta %= shortFreq;
			}
			u64 us = delta * 1000000U / shortFreq;
			emul += us;
			remainder = delta - us * freq / 1000000U;	// note: shortFreq == freq in this branch
			lastTick = cur;
			return (i64)emul;
		}
	}
	if ( init == 1 )
	{
		LARGE_INTEGER tmp;
		QueryPerformanceCounter( &tmp );
		u64 cur = (u64)tmp.QuadPart;
		{
			MutexLock _( usMutex );
			u64 delta = cur - lastTick + remainder;
			if ( delta >= freq )					// enhance stability if interval is longer than one sec
			{
				emul += (delta/freq) * 1000000U;
				delta %= freq;
			}
			u64 us = delta * 1000000U / freq;
			emul += us;
			remainder = delta - us * freq / 1000000U;
			lastTick = cur;
			return (i64)emul;
		}
	}
	if ( init == 2 )
	{
		MutexLock _( usMutex );
		u32 cur = (u32)GetTickCount();
		u32 delta = cur - lastTC;
		u64 tmp = (u64)delta * 1000;
		lastTC = cur;
		return (i64)(emul += tmp);
	}

	// initialize
	{
		MutexLock _( gMutex );

		LARGE_INTEGER frq;
		if ( QueryPerformanceFrequency( &frq ) == FALSE )
		{
			// not available => use millisec emulation
			init = 2;
			lastTC = (u32)GetTickCount();
		} else {
			freq = (u64)frq.QuadPart;
			LARGE_INTEGER tmp;
			QueryPerformanceCounter( &tmp );
			lastTick = (u64)tmp.QuadPart;

			if ( freq <= 0xffffffffU )
			{
				shortFreq = (u32)freq;
				init = -1;
			}
			else
			{
				init = 1;
			}
		}
	}
	return getMicrosec();
}

Code: Select all

static Mutex counterMutex;
static u64 counterUs = 0;
static u32 counterMs = 0;
static int counterInit = 0;

i32 getMillisec()
{
	u64 us = (u64)getMicrosec();
	MutexLock _( counterMutex );
	if ( !counterInit )
	{
		counterInit = 1;
		counterUs = us;
		return counterMs;
	}
	u64 delta = us - counterUs;
	if ( delta /= 1000 )
	{
		counterMs += (u32)(delta & 0xffffffff);
		counterUs += delta * 1000;
	}
	return (i32)counterMs;
}
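Since the counters are tested for wraparound, interval measurement with the usual unsigned-subtraction idiom should stay correct across a wrap; a sketch of what I mean:

Code: Select all

// sketch: interval measurement that survives a counter wrap,
// relying on two's-complement subtraction of the u32 values
u32 t0 = (u32)getMillisec();
// ... do work ...
u32 elapsedMs = (u32)getMillisec() - t0;	// correct even if the counter wrapped once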
diep
Posts: 1822
Joined: Thu Mar 09, 2006 11:54 pm
Location: The Netherlands

Re: microsecond-accurate timing on Windows

Post by diep »

From what I understand, GetTickCount() in the Windows kernel is a simple register move. So it reports time in milliseconds, but it should be accurate to within several nanoseconds, since the latency between the register move and when you receive the value is really small. The explanation for that is a plausible one.

With an atomic clock attached you can measure the real accuracy.

In general, most systems choke if you do too many timing calls per second, especially big supercomputers, which have special processors for time (clock processors). So be careful when measuring.

So when I want to have a CPU spin for another few microseconds, what I do is measure at startup how long X spins take, with X rather big. Then you divide that back down to microseconds. You'll deal with some overflows, so you need 64-bit math to get it right, but it'll do the job more than OK. Using GetTickCount() for the calibration is more than OK.
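A minimal sketch of the calibration idea (an illustration, not diep's actual code):

Code: Select all

// minimal sketch: calibrate an empty spin loop against GetTickCount(),
// then busy-wait in microsecond units (illustration only)
#include <windows.h>

static volatile unsigned long long spinSink = 0;	// keeps the loop from being optimized away
static unsigned long long spinsPerUs = 1;

void calibrateSpins()
{
	const unsigned long long total = 200000000ULL;	// "X rather big"
	DWORD start = GetTickCount();
	for ( unsigned long long i = 0; i < total; i++ )
		spinSink += i;
	DWORD ms = GetTickCount() - start;
	if ( ms == 0 )
		ms = 1;
	// 64-bit math avoids overflow when converting to spins per microsecond
	spinsPerUs = total / ( (unsigned long long)ms * 1000ULL );
	if ( spinsPerUs == 0 )
		spinsPerUs = 1;
}

void spinMicrosec( unsigned us )
{
	unsigned long long n = spinsPerUs * us;
	for ( unsigned long long i = 0; i < n; i++ )
		spinSink += i;
}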

Vincent
mar
Posts: 2554
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

Re: microsecond-accurate timing on Windows

Post by mar »

diep wrote: From what I understand, GetTickCount() in the Windows kernel is a simple register move. So it reports time in milliseconds, but it should be accurate to within several nanoseconds, since the latency between the register move and when you receive the value is really small. The explanation for that is a plausible one.

With an atomic clock attached you can measure the real accuracy.

Vincent
I'm not sure how it's implemented. Probably it uses some counter which gets incremented on a hardware timer interrupt, who knows.
But the problem with GetTickCount is that it has a resolution/granularity of ~16 milliseconds, which makes it useless for most practical purposes. Actually, IMO a resolution worse than 5 msec is useless for any realtime application.

After lots of googling I figured out that QPC is not stable on all systems. Lots of people have reported instability on some systems (multicore or power-saving), which suggests it uses the TSC internally, and that of course won't work reliably under those conditions. So I'm definitely dropping QueryPerformanceCounter and switching to timeGetTime. I believe it has 1 msec accuracy on most Windows systems today anyway.
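A minimal sketch of the timeGetTime route (assuming a 1 ms period request is acceptable despite being system-global, and that the program links against winmm.lib):

Code: Select all

// minimal sketch: millisecond timing via timeGetTime with a 1 ms period
// (assumes linking against winmm.lib; error handling omitted)
#include <windows.h>
#include <mmsystem.h>

void timerInit()
{
	timeBeginPeriod( 1 );	// request 1 ms timer resolution (system-global!)
}

void timerDone()
{
	timeEndPeriod( 1 );		// must match the timeBeginPeriod call
}

DWORD elapsedMs( DWORD start )
{
	// unsigned subtraction handles the ~49.7-day wraparound correctly
	return timeGetTime() - start;
}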
diep
Posts: 1822
Joined: Thu Mar 09, 2006 11:54 pm
Location: The Netherlands

Re: microsecond-accurate timing on Windows

Post by diep »

mar wrote:
diep wrote: From what I understand, GetTickCount() in the Windows kernel is a simple register move. So it reports time in milliseconds, but it should be accurate to within several nanoseconds, since the latency between the register move and when you receive the value is really small. The explanation for that is a plausible one.

With an atomic clock attached you can measure the real accuracy.

Vincent
I'm not sure how it's implemented. Probably it uses some counter which gets incremented on a hardware timer interrupt, who knows.
But the problem with GetTickCount is that it has a resolution/granularity of ~16 milliseconds, which makes it useless for most practical purposes. Actually, IMO a resolution worse than 5 msec is useless for any realtime application.

After lots of googling I figured out that QPC is not stable on all systems. Lots of people have reported instability on some systems (multicore or power-saving), which suggests it uses the TSC internally, and that of course won't work reliably under those conditions. So I'm definitely dropping QueryPerformanceCounter and switching to timeGetTime. I believe it has 1 msec accuracy on most Windows systems today anyway.
Let me write it again. Internally it has granularity within a nanosecond, which then gets converted. So if it says something has been eating 3 milliseconds, it can be 3.000 or 3.999, but not 4 milliseconds and definitely not 2.1 milliseconds.

So I'm also using this to measure how much system time Diep eats. And if I add up the times, the error is never more than 1 millisecond, so the information given to me, that it is a register move, seems to be correct.
mar
Posts: 2554
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

Re: microsecond-accurate timing on Windows

Post by mar »

diep wrote: So when I want to have a CPU spin for another few microseconds, what I do is measure at startup how long X spins take, with X rather big.
Yes, that's probably the only way to do delays at microsecond scale and below.
But what happens if another process demands resources during the calibration? It could happen (in theory).

But what I want is to measure time intervals.
diep
Posts: 1822
Joined: Thu Mar 09, 2006 11:54 pm
Location: The Netherlands

Re: microsecond-accurate timing on Windows

Post by diep »

mar wrote:
diep wrote: So when I want to have a CPU spin for another few microseconds, what I do is measure at startup how long X spins take, with X rather big.
Yes, that's probably the only way to do delays at microsecond scale and below.
But what happens if another process demands resources during the calibration? It could happen (in theory).

But what I want is to measure time intervals.
What do you need this for?

You do realize there are special ways to debug software using the information the CPU has? It's nanosecond-accurate, the same trick the kernel uses for GetTickCount().

However, many CPUs nowadays throttle and turbo-boost and whatever, so I always prefer measurements over several seconds or so.
mar
Posts: 2554
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

Re: microsecond-accurate timing on Windows

Post by mar »

diep wrote: Let me write it again. Internally it has granularity within a nanosecond, which then gets converted. So if it says something has been eating 3 milliseconds, it can be 3.000 or 3.999, but not 4 milliseconds and definitely not 2.1 milliseconds.
It depends on the system you use, though I doubt you get 1 ms granularity with GetTickCount().

Here's what I get on XP using GetTickCount():
delta = 0 ms
delta = 16 ms
delta = 31 ms
delta = 47 ms
delta = 63 ms
...

and using timeGetTime():

delta = 0 ms
delta = 1 ms
delta = 2 ms
delta = 3 ms
delta = 4 ms
...

In other words, anything that takes less than 16 msec, measured using GetTickCount, will report as either 0 ms or 16 ms.
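For reference, output like the above can be produced with a loop along these lines (a sketch of the measurement, not necessarily the exact code used):

Code: Select all

// sketch: print successive deltas to observe timer granularity
// (timeGetTime needs winmm.lib; substitute GetTickCount to compare)
#include <windows.h>
#include <mmsystem.h>
#include <stdio.h>

int main()
{
	DWORD base = timeGetTime();
	for ( int i = 0; i < 5; i++ )
	{
		Sleep( 1 );		// give the counter a chance to advance
		printf( "delta = %lu ms\n", (unsigned long)( timeGetTime() - base ) );
	}
	return 0;
}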
mar
Posts: 2554
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

Re: microsecond-accurate timing on Windows

Post by mar »

diep wrote: What do you need this for?
I have an OGL app and need to measure the time between frames to update the logic. I get framerates around 150, so 16 ms granularity is not enough for me.
diep wrote: You do realize there are special ways to debug software using the information the CPU has? It's nanosecond-accurate, the same trick the kernel uses for GetTickCount().
Yes, you probably mean the timestamp counter on the Pentium and later. There are also hardware debug registers to break on a certain execution address, I/O port access, memory R/W access and so on. Already the 386 could do that.
diep wrote: However, many CPUs nowadays throttle and turbo-boost and whatever, so I always prefer measurements over several seconds or so.
I agree.
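For what it's worth, reading the timestamp counter from user code is a one-liner with the compiler intrinsic; a sketch, with the multi-core and variable-frequency caveats discussed above:

Code: Select all

// minimal sketch: read the CPU timestamp counter via the MSVC intrinsic
// (subject to the multi-core / variable-frequency caveats discussed above)
#include <intrin.h>

unsigned __int64 readTsc()
{
	return __rdtsc();	// raw clock ticks since reset, not wall time
}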
diep
Posts: 1822
Joined: Thu Mar 09, 2006 11:54 pm
Location: The Netherlands

Re: microsecond-accurate timing on Windows

Post by diep »

mar wrote:
diep wrote: What do you need this for?
I have an OGL app and need to measure the time between frames to update the logic. I get framerates around 150, so 16 ms granularity is not enough for me.
diep wrote: You do realize there are special ways to debug software using the information the CPU has? It's nanosecond-accurate, the same trick the kernel uses for GetTickCount().
Yes, you probably mean the timestamp counter on the Pentium and later. There are also hardware debug registers to break on a certain execution address, I/O port access, memory R/W access and so on. Already the 386 could do that.
diep wrote: However, many CPUs nowadays throttle and turbo-boost and whatever, so I always prefer measurements over several seconds or so.
I agree.
Maybe ask in a graphics group if you want such a turbo framerate in OpenGL.

Hopefully your users are all on some new sort of AICAR drug, the new undetectable form of EPO that's going to be a problem in London. Your users will really need a superdrug like that if they want to stand a chance of keeping up with a 150 frames per second game.