Open Chess Game Database Standard

Sopel · Post by **Sopel** » Tue Nov 30, 2021 1:54 am

Dann Corbit wrote: ↑Mon Nov 29, 2021 10:51 pm Re: "And that buffering is precisely what you want to AVOID for files that are meant to be read once."

Code: Select all

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

char string[32767];
/*
 * Read one line from stdin into buffer and drop the trailing newline.
 *
 * buffer: destination array; NULL is rejected without reading anything.
 * count:  capacity of buffer in bytes, including the terminating '\0'.
 *
 * Returns buffer on success. Returns NULL when buffer is NULL, when count
 * is less than 1, or when fgets() reports end-of-file or an error.
 * A capacity of exactly 1 produces an empty string without touching stdin.
 */
char *getsafe(char *buffer, int count)
{
    char *newline;

    if (buffer == NULL || count < 1) {
        return NULL;
    }

    if (count == 1) {
        /* Room for the terminator only. */
        buffer[0] = '\0';
        return buffer;
    }

    if (fgets(buffer, count, stdin) == NULL) {
        return NULL;
    }

    /* fgets() keeps the newline when the whole line fit; cut it off. */
    newline = strchr(buffer, '\n');
    if (newline != NULL) {
        *newline = '\0';
    }
    return buffer;
}

#include <stdio.h>
char buffer[512]= {0};
/*
 * Benchmark: time a sequential read of a large PGN file under one of three
 * stdio buffering modes.
 *
 * argv[1] (optional) selects the mode and is clamped to the range 0..2:
 *   0 - full buffering, 32767 bytes requested (default)
 *   1 - line buffering, 512 bytes requested
 *   2 - no stream buffering
 *
 * Returns 0 after printing the elapsed time, or EXIT_FAILURE when the
 * input file cannot be opened.
 */
int main(int argc, char **argv)
{
    static const char path[] = "C:\\lichess\\lichess_gm_2020-09.pgn";
    FILE *pFile;
    clock_t start, end;
    int buftype = 0;
    int vbuf_rc;
    double seconds;

    if (argc > 1) {
        buftype = atoi(argv[1]);
        if (buftype < 0) {
            buftype = 0;
        }
        if (buftype > 2) {
            buftype = 2;
        }
    }

    /* Open first and bail out on failure: setvbuf() must never be handed
     * a null stream. */
    pFile = fopen(path, "r");
    if (pFile == NULL) {
        perror(path);
        return EXIT_FAILURE;
    }

    /* setvbuf() has to run before any other operation on the stream. */
    if (buftype == 0) { /* full buffering */
        vbuf_rc = setvbuf(pFile, NULL, _IOFBF, 32767);
        puts("Full buffering with 32 K");
    } else if (buftype == 1) { /* line buffering */
        vbuf_rc = setvbuf(pFile, NULL, _IOLBF, 512);
        puts("line buffering with 0.5 K");
    } else { /* no buffering */
        vbuf_rc = setvbuf(pFile, NULL, _IONBF, 0);
        puts("no buffering");
    }
    if (vbuf_rc != 0) {
        /* The stream stays usable with its default buffering, so the
         * timing below would not reflect the requested mode. */
        fputs("warning: setvbuf failed; default buffering is in effect\n",
              stderr);
    }

    /* clock() reports processor time used by the program, which is what
     * this benchmark has always printed. */
    start = clock();
    while (fread(buffer, 1, sizeof buffer, pFile) > 0) {
        /* The data is discarded; only the cost of reading is measured. */
    }
    end = clock();

    if (ferror(pFile)) {
        fputs("warning: read error before end of file\n", stderr);
    }
    fclose(pFile);

    seconds = (double)(end - start) / CLOCKS_PER_SEC;
    printf("Elapsed time is %g seconds.\n", seconds);
    return 0;
}

/*
G:\cc>gcc buftest.c

G:\cc>a 0
Full buffering with 32 K
Elapsed time is 1.027 seconds.

G:\cc>a 1
line buffering with 0.5 K
Elapsed time is 3.34 seconds.

G:\cc>a 2
no buffering
Elapsed time is 5.363 seconds.
*/

11/03/2021 05:39 PM 441,396,530 lichess_gm_2020-09.pgn

Obviously if you're gonna use a 512 byte buffer you're gonna run into issues. Now try something more realistic, like for example 1-4MiB. Also there's no async io here which would close the gap even more.

And as a side note. Looks like I'm mistaken about `setvbuf`, it does not affect caching by OS, that might not be possible to remove in a portable manner.

dangi12012 · Post by **dangi12012** » Tue Nov 30, 2021 1:56 am

Sopel wrote: ↑Tue Nov 30, 2021 1:54 am

Dann Corbit wrote: ↑Mon Nov 29, 2021 10:51 pm Re: "And that buffering is precisely what you want to AVOID for files that are meant to be read once."

Code: Select all

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

char string[32767];
/*
 * Read one line from stdin into buffer and drop the trailing newline.
 *
 * buffer: destination array; NULL is rejected without reading anything.
 * count:  capacity of buffer in bytes, including the terminating '\0'.
 *
 * Returns buffer on success. Returns NULL when buffer is NULL, when count
 * is less than 1, or when fgets() reports end-of-file or an error.
 * A capacity of exactly 1 produces an empty string without touching stdin.
 */
char *getsafe(char *buffer, int count)
{
    char *newline;

    if (buffer == NULL || count < 1) {
        return NULL;
    }

    if (count == 1) {
        /* Room for the terminator only. */
        buffer[0] = '\0';
        return buffer;
    }

    if (fgets(buffer, count, stdin) == NULL) {
        return NULL;
    }

    /* fgets() keeps the newline when the whole line fit; cut it off. */
    newline = strchr(buffer, '\n');
    if (newline != NULL) {
        *newline = '\0';
    }
    return buffer;
}

#include <stdio.h>
char buffer[512]= {0};
/*
 * Benchmark: time a sequential read of a large PGN file under one of three
 * stdio buffering modes.
 *
 * argv[1] (optional) selects the mode and is clamped to the range 0..2:
 *   0 - full buffering, 32767 bytes requested (default)
 *   1 - line buffering, 512 bytes requested
 *   2 - no stream buffering
 *
 * Returns 0 after printing the elapsed time, or EXIT_FAILURE when the
 * input file cannot be opened.
 */
int main(int argc, char **argv)
{
    static const char path[] = "C:\\lichess\\lichess_gm_2020-09.pgn";
    FILE *pFile;
    clock_t start, end;
    int buftype = 0;
    int vbuf_rc;
    double seconds;

    if (argc > 1) {
        buftype = atoi(argv[1]);
        if (buftype < 0) {
            buftype = 0;
        }
        if (buftype > 2) {
            buftype = 2;
        }
    }

    /* Open first and bail out on failure: setvbuf() must never be handed
     * a null stream. */
    pFile = fopen(path, "r");
    if (pFile == NULL) {
        perror(path);
        return EXIT_FAILURE;
    }

    /* setvbuf() has to run before any other operation on the stream. */
    if (buftype == 0) { /* full buffering */
        vbuf_rc = setvbuf(pFile, NULL, _IOFBF, 32767);
        puts("Full buffering with 32 K");
    } else if (buftype == 1) { /* line buffering */
        vbuf_rc = setvbuf(pFile, NULL, _IOLBF, 512);
        puts("line buffering with 0.5 K");
    } else { /* no buffering */
        vbuf_rc = setvbuf(pFile, NULL, _IONBF, 0);
        puts("no buffering");
    }
    if (vbuf_rc != 0) {
        /* The stream stays usable with its default buffering, so the
         * timing below would not reflect the requested mode. */
        fputs("warning: setvbuf failed; default buffering is in effect\n",
              stderr);
    }

    /* clock() reports processor time used by the program, which is what
     * this benchmark has always printed. */
    start = clock();
    while (fread(buffer, 1, sizeof buffer, pFile) > 0) {
        /* The data is discarded; only the cost of reading is measured. */
    }
    end = clock();

    if (ferror(pFile)) {
        fputs("warning: read error before end of file\n", stderr);
    }
    fclose(pFile);

    seconds = (double)(end - start) / CLOCKS_PER_SEC;
    printf("Elapsed time is %g seconds.\n", seconds);
    return 0;
}

/*
G:\cc>gcc buftest.c

G:\cc>a 0
Full buffering with 32 K
Elapsed time is 1.027 seconds.

G:\cc>a 1
line buffering with 0.5 K
Elapsed time is 3.34 seconds.

G:\cc>a 2
no buffering
Elapsed time is 5.363 seconds.
*/

11/03/2021 05:39 PM 441,396,530 lichess_gm_2020-09.pgn

Obviously if you're gonna use a 512 byte buffer you're gonna run into issues. Now try something more realistic, like for example 1-4MiB. Also there's no async io here which would close the gap even more.

And as a side note. Looks like I'm mistaken about `setvbuf`, it does not affect caching by OS, that might not be possible to remove in a portable manner.

Use the Page size. But memory mapped io is superior.

Sopel · Post by **Sopel** » Tue Nov 30, 2021 1:58 am

dangi12012 wrote: ↑Tue Nov 30, 2021 1:56 am

Sopel wrote: ↑Tue Nov 30, 2021 1:54 am

Dann Corbit wrote: ↑Mon Nov 29, 2021 10:51 pm Re: "And that buffering is precisely what you want to AVOID for files that are meant to be read once."

Code: Select all

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

char string[32767];
/*
 * Read one line from stdin into buffer and drop the trailing newline.
 *
 * buffer: destination array; NULL is rejected without reading anything.
 * count:  capacity of buffer in bytes, including the terminating '\0'.
 *
 * Returns buffer on success. Returns NULL when buffer is NULL, when count
 * is less than 1, or when fgets() reports end-of-file or an error.
 * A capacity of exactly 1 produces an empty string without touching stdin.
 */
char *getsafe(char *buffer, int count)
{
    char *newline;

    if (buffer == NULL || count < 1) {
        return NULL;
    }

    if (count == 1) {
        /* Room for the terminator only. */
        buffer[0] = '\0';
        return buffer;
    }

    if (fgets(buffer, count, stdin) == NULL) {
        return NULL;
    }

    /* fgets() keeps the newline when the whole line fit; cut it off. */
    newline = strchr(buffer, '\n');
    if (newline != NULL) {
        *newline = '\0';
    }
    return buffer;
}

#include <stdio.h>
char buffer[512]= {0};
/*
 * Benchmark: time a sequential read of a large PGN file under one of three
 * stdio buffering modes.
 *
 * argv[1] (optional) selects the mode and is clamped to the range 0..2:
 *   0 - full buffering, 32767 bytes requested (default)
 *   1 - line buffering, 512 bytes requested
 *   2 - no stream buffering
 *
 * Returns 0 after printing the elapsed time, or EXIT_FAILURE when the
 * input file cannot be opened.
 */
int main(int argc, char **argv)
{
    static const char path[] = "C:\\lichess\\lichess_gm_2020-09.pgn";
    FILE *pFile;
    clock_t start, end;
    int buftype = 0;
    int vbuf_rc;
    double seconds;

    if (argc > 1) {
        buftype = atoi(argv[1]);
        if (buftype < 0) {
            buftype = 0;
        }
        if (buftype > 2) {
            buftype = 2;
        }
    }

    /* Open first and bail out on failure: setvbuf() must never be handed
     * a null stream. */
    pFile = fopen(path, "r");
    if (pFile == NULL) {
        perror(path);
        return EXIT_FAILURE;
    }

    /* setvbuf() has to run before any other operation on the stream. */
    if (buftype == 0) { /* full buffering */
        vbuf_rc = setvbuf(pFile, NULL, _IOFBF, 32767);
        puts("Full buffering with 32 K");
    } else if (buftype == 1) { /* line buffering */
        vbuf_rc = setvbuf(pFile, NULL, _IOLBF, 512);
        puts("line buffering with 0.5 K");
    } else { /* no buffering */
        vbuf_rc = setvbuf(pFile, NULL, _IONBF, 0);
        puts("no buffering");
    }
    if (vbuf_rc != 0) {
        /* The stream stays usable with its default buffering, so the
         * timing below would not reflect the requested mode. */
        fputs("warning: setvbuf failed; default buffering is in effect\n",
              stderr);
    }

    /* clock() reports processor time used by the program, which is what
     * this benchmark has always printed. */
    start = clock();
    while (fread(buffer, 1, sizeof buffer, pFile) > 0) {
        /* The data is discarded; only the cost of reading is measured. */
    }
    end = clock();

    if (ferror(pFile)) {
        fputs("warning: read error before end of file\n", stderr);
    }
    fclose(pFile);

    seconds = (double)(end - start) / CLOCKS_PER_SEC;
    printf("Elapsed time is %g seconds.\n", seconds);
    return 0;
}

/*
G:\cc>gcc buftest.c

G:\cc>a 0
Full buffering with 32 K
Elapsed time is 1.027 seconds.

G:\cc>a 1
line buffering with 0.5 K
Elapsed time is 3.34 seconds.

G:\cc>a 2
no buffering
Elapsed time is 5.363 seconds.
*/

11/03/2021 05:39 PM 441,396,530 lichess_gm_2020-09.pgn

Obviously if you're gonna use a 512 byte buffer you're gonna run into issues. Now try something more realistic, like for example 1-4MiB. Also there's no async io here which would close the gap even more.

And as a side note. Looks like I'm mistaken about `setvbuf`, it does not affect caching by OS, that might not be possible to remove in a portable manner.

Use the Page size. But memory mapped io is superior.

After all the reasons I gave for why memory mapped io is NOT FREAKING SUPERIOR IN THIS CASE, you dare tell me it is WITHOUT PROVIDING A SINGLE REASONABLE ARGUMENT for why that would be the case.

Dann Corbit · Post by **Dann Corbit** » Tue Nov 30, 2021 10:31 am

if (buftype == 0) /* full buffering */
{
setvbuf ( pFile, NULL, _IOFBF, 32767);
puts("Full buffering with 32 K");
}
else if (buftype == 1) /* line buffering */
{
setvbuf ( pFile, NULL, _IOLBF, 512);
puts("line buffering with 0.5 K");
}
else /* no buffering */
{
setvbuf ( pFile, NULL, _IONBF, 0);
puts("no buffering");
}

Obviously if you're gonna use a 512 byte buffer you're gonna run into issues. Now try something more realistic, like for example 1-4MiB. Also there's no async io here which would close the gap even more.

As you can see above, I used 32K for full buffering, 512 bytes for line buffering, and 0 bytes for no buffering.
Why allocate a lot of memory when you are doing line buffering?
I guess that 10 MB buffer size would not add much speed for full buffering. It is a simple experiment to try, of course.

Since I ran full buffering first, line buffering second, and no buffering third, any OS caching would benefit the subsequent methods.
I guess there is very little of that going on, since the file is over 400MB.

To show the superiority of some other method, just write a snippet of code that shows it and we can all try it.
I am an old dog, but I love to learn new tricks.

Sopel · Post by **Sopel** » Tue Nov 30, 2021 11:58 am

Dann Corbit wrote: ↑Tue Nov 30, 2021 10:31 am if (buftype == 0) /* full buffering */
{
setvbuf ( pFile, NULL, _IOFBF, 32767);
puts("Full buffering with 32 K");
}
else if (buftype == 1) /* line buffering */
{
setvbuf ( pFile, NULL, _IOLBF, 512);
puts("line buffering with 0.5 K");
}
else /* no buffering */
{
setvbuf ( pFile, NULL, _IONBF, 0);
puts("no buffering");
}
Obviously if you're gonna use a 512 byte buffer you're gonna run into issues. Now try something more realistic, like for example 1-4MiB. Also there's no async io here which would close the gap even more.
As you can see above, I used 32K for full buffering, 512 bytes for line buffering, and 0 bytes for no buffering.
Why allocate a lot of memory when you are doing line buffering?
I guess that 10 MB buffer size would not add much speed for full buffering. It is a simple experiment to try, of course.

Since I ran full buffering first, line buffering second, and no buffering third, any OS caching would benefit the subsequent methods.
I guess there is very little of that going on, since the file is over 400MB.

To show the superiority of some other method, just write a snippet of code that shows it and we can all try it.
I am an old dog, but I love to learn new tricks.

Code: Select all

char buffer[512]= {0};
...
fread(buffer, 1, sizeof buffer, pFile)

Fulvio · Post by **Fulvio** » Tue Nov 30, 2021 12:32 pm

Dann Corbit wrote: ↑Tue Nov 30, 2021 10:31 am As you can see above, I used 32K for full buffering, 512 bytes for line buffering, and 0 bytes for no buffering.
Why allocate a lot of memory when you are doing line buffering?
I guess that 10 MB buffer size would not add much speed for full buffering. It is a simple experiment to try, of course.

When you invoke fread() a lot of things happen; you are not reading directly from the disk.
There is a nice image here: https://www.brendangregg.com/perf.html
Your call goes through a system library, into kernel mode, file systems, volume manager, device driver...
So your code is measuring the overhead of all those things.
To test if buffering really improves the read speed it is necessary to change

Code: Select all

char buffer[512]= {0}

to

Code: Select all

char buffer[32 * 1024]= {0}

Dann Corbit · Post by **Dann Corbit** » Tue Nov 30, 2021 7:37 pm

Fulvio wrote: ↑Tue Nov 30, 2021 12:32 pm When you invoke fread() a lot of things happens, you are not reading directly from the disk.
There is a nice image here: https://www.brendangregg.com/perf.html
You call a system library, to kernel mode, file systems, volume manager, device driver...
So your code is measuring the overhead of all those things.
To test if buffering really improves the read speed it is necessary to change
Code: Select all
char buffer[512]= {0}
to
Code: Select all
char buffer[32 * 1024]= {0}

I was reading 512 byte maximum strings using line oriented I/O.
I have a 32 K string in the code up above, but it has no utility for reading small lines of text.
You will notice that I was using a safe version of fgets().
Seems a logical thing to do when reading PGN from a file.

BTW, if you are the Fulvio who works on SCID, that is very impressive work.
Hats off.

dangi12012 · Post by **dangi12012** » Tue Nov 30, 2021 10:52 pm

Dann Corbit wrote: ↑Tue Nov 30, 2021 7:37 pm
Fulvio wrote: ↑Tue Nov 30, 2021 12:32 pm When you invoke fread() a lot of things happens, you are not reading directly from the disk.
There is a nice image here: https://www.brendangregg.com/perf.html
You call a system library, to kernel mode, file systems, volume manager, device driver...
So your code is measuring the overhead of all those things.
To test if buffering really improves the read speed it is necessary to change
Code: Select all
char buffer[512]= {0}
to
Code: Select all
char buffer[32 * 1024]= {0}
I was reading 512 byte maximum strings using line oriented I/O.
I have a 32 K string in the code up above, but it has no utility for reading small lines of text.
You will notice that I was using a safe version of fgets().
Seems a logical thing to do when reading PGN from a file.

BTW, if you are the Fulvio who works on SCID, that is very impressive work.
Hats off.

Some people derail the conversation. Having an Open Chess Game Database Standard should have nothing to do with "how big the buffer is in C++".
It's very fast and I am happy that there is an SQL standard now. Also, it's fast enough and can only be made faster by multithreading, which should be reserved for the topmost level of the application and not a sequential parser.

Can we push it further and make every position queryable as well, to make a query for "opening book for player x" possible?

Dann Corbit · Post by **Dann Corbit** » Tue Nov 30, 2021 11:32 pm

To make the positions queryable, you would have to save an EPD record for every unique position in the database, and then store a list of EPD positions.
Not sure what SCID does to achieve queryable positions, but that might be worth examination.

I think a very nice possible outcome would be to have a new version of SCID that sits on top of PGN.
The value for SCID is that a database can be arbitrarily large, have an arbitrary number of players, events, etc.
The value for the tool user is SCID's fabulous functionality, now sitting on top of something that won't die when you add a lot of data.

Side benefit:
Arbitrary SQL queries could be performed against the data.

For instance, here is the count of chessmen on the board (SQL Server version):

Code: Select all

-- Counts the piece letters (P, N, B, R, Q, K in either case) in the first
-- word of @s, as returned by dbo.GetFirstWord.
-- NOTE(review): presumably @s is a FEN/EPD record whose first word is the
-- piece-placement field; confirm against dbo.GetFirstWord.
CREATE FUNCTION [dbo].[ChessmanCount] (@s VARCHAR(max))
RETURNS int
WITH EXECUTE AS CALLER
AS
BEGIN
     DECLARE @board AS VARCHAR(MAX);
     DECLARE @residue AS VARCHAR(MAX);
     DECLARE @count AS integer;

     -- First word in its original case; only its length is needed.
     SET @board = dbo.GetFirstWord(@s);

     -- First word of the upper-cased input with every piece letter removed.
     SET @residue =
         REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
             dbo.GetFirstWord(UPPER(@s)),
             'P', ''), 'N', ''), 'B', ''), 'R', ''), 'Q', ''), 'K', '');

     -- Each removed character was one chessman.
     SET @count = LEN(@board) - LEN(@residue);
     RETURN(@count);
END
GO

dangi12012 · Post by **dangi12012** » Wed Dec 01, 2021 12:29 am

Dann Corbit wrote: ↑Tue Nov 30, 2021 11:32 pm To make the positions queryable, you would have to save an EPD record for every unique position in the database,and then store a list of EPD positions.
Not sure what SCID does to achieve queryable positions, but that might be worth examination.

I think a very nice possible outcome would be to have a new version of SCID that sits on top of PGN.
The value for SCID is that a database can be arbitrarily large, have an arbitrary number of players, events,etc.
The value for the tool user is SCID's fabulous functionality, now sitting on top of something that won't die when you add a lot of data.

Side benefit:
Arbitrary SQL queries could be performed against the data.

For instance, here is the count of chessmen on the board (SQL Server version):
Code: Select all
-- Counts the piece letters (P, N, B, R, Q, K in either case) in the first
-- word of @s, as returned by dbo.GetFirstWord.
-- NOTE(review): presumably @s is a FEN/EPD record whose first word is the
-- piece-placement field; confirm against dbo.GetFirstWord.
CREATE FUNCTION [dbo].[ChessmanCount] (@s VARCHAR(max))
RETURNS int
WITH EXECUTE AS CALLER
AS
BEGIN
     DECLARE @board AS VARCHAR(MAX);
     DECLARE @residue AS VARCHAR(MAX);
     DECLARE @count AS integer;

     -- First word in its original case; only its length is needed.
     SET @board = dbo.GetFirstWord(@s);

     -- First word of the upper-cased input with every piece letter removed.
     SET @residue =
         REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
             dbo.GetFirstWord(UPPER(@s)),
             'P', ''), 'N', ''), 'B', ''), 'R', ''), 'Q', ''), 'K', '');

     -- Each removed character was one chessman.
     SET @count = LEN(@board) - LEN(@residue);
     RETURN(@count);
END
GO

I think we had that discussion already. In SQL you would only need to store a position ID and a corresponding hash as keys if you want to store billions of positions and millions of games.
The game table is just gameid, move, posID
The position table is posID, hash, position, eval

When you insert you calculate the zobrist hash and increment the running id. Now each move in every game will take 32 bit for the first 4 billion UNIQUE positions. After that it will take 48 bit. So not too bad.

There was a discussion around SCID that died, because maintaining a binary format IS HARD and takes a lot of time to get right. That's why a db standard format is now SQL.
http://www.talkchess.com/forum3/viewtop ... a2e7338a0e

That would enable this query:

Code: Select all

-- Positions that player 'lichess1' reached on move 10 with an evaluation
-- below 0.5, one row per position, most frequent first.
-- NOTE(review): assumes game rows expose playerid and posid, and position
-- rows expose posid and eval, as sketched in the post; confirm against the
-- real schema.
SELECT p.posid, COUNT(*) AS games
FROM game m
JOIN position p ON p.posid = m.posid
WHERE playerid = 'lichess1'
  AND m.move = 10
  AND p.eval < 0.5
GROUP BY p.posid
ORDER BY games DESC

This finds where the player has had a slightly worse position on move 10, grouped by position. That would enable you to find where your opponent often makes a mistake in one simple query.

Open Chess Game Database Standard

Re: Open Chess Game Database Standard

Re: Open Chess Game Database Standard

Re: Open Chess Game Database Standard

Re: Open Chess Game Database Standard

Re: Open Chess Game Database Standard

Re: Open Chess Game Database Standard

Re: Open Chess Game Database Standard

Re: Open Chess Game Database Standard

Re: Open Chess Game Database Standard

Re: Open Chess Game Database Standard