Measuring times and optimizing 2D Starfield

Pincha aquí para verlo en español

In the previous tutorial 2D Starfield we learned how to make a nice and simple star field effect. And we saw that is limited to 40 stars simultaneously, because if we put more stars, the program starts to run slow... In this tutorial we will learn to easily measure and optimize the program times all we can.

To measure time, usually we select a routine and analyzes the assembler source code and count the cycles per instruction... a slow and boring process. There are faster ways of measuring a block of code or counting frames / loops per second that a program is achieving. To do this we will use a firmware command called KL TIME PLEASE (BD0D) that returns a counter with the time elapsed since the computer is turned on or reset (in units of 1/300 seconds, ie 3.3333ms). This counter is a 32 bit value (4 bytes), and overflows (starts from 0 again) after about 166 days (32bit = 4294967296/300 = 14316557s / 86400sday = 165.7 days). As we will make measurements much smaller, we can use only 2 bytes of the counter, allowing us to measure times of up to 3.6 minutes. This measurement method is valid as long as interrupts are enabled, otherwise, the counter stops increasing. At the time of writing this tutorial, this method does not work with z88dk, as always interrupts are disabled (I have already reported the problem), so we will use from sdcc.

To read the counter value, we take a simple function:

////////////////////////////////////////////////////////////////////////
unsigned char char1,char2,char3,char4;

unsigned int GetTime()
{
  unsigned int nTime = 0;

  __asm
    CALL #0xBD0D ;KL TIME PLEASE
    PUSH HL
    POP DE
    LD HL, #_char3
    LD (HL), D
    LD HL, #_char4
    LD (HL), E
  __endasm;

  nTime = (char3 << 8) + char4;

  return nTime;
}
////////////////////////////////////////////////////////////////////////

To measure the frames / loops per second which gives the program, simply count the times through the loop delete / move / paint and display the value on the screen every second, the full source code would be this (you can download at the end):

////////////////////////////////////////////////////////////////////////
// star01.c
// Optimizing a 2D Star Field
// Mochilote - www.cpcmania.com
////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void SetMode0PixelColor(unsigned char *pByteAddress, unsigned char nColor, unsigned char nPixel)
{
  unsigned char nByte = *pByteAddress;

  if(nPixel == 0)
  {
    nByte &= 85;

    if(nColor & 1)
      nByte |= 128;

    if(nColor & 2)
      nByte |= 8;

    if(nColor & 4)
      nByte |= 32;

    if(nColor & 8)
      nByte |= 2;
  }
  else
  {
    nByte &= 170;

    if(nColor & 1)
      nByte |= 64;

    if(nColor & 2)
      nByte |= 4;

    if(nColor & 4)
      nByte |= 16;

    if(nColor & 8)
      nByte |= 1;
  }

  *pByteAddress = nByte;
}

void PutPixelMode0(unsigned char nX, unsigned char nY, unsigned char nColor)
{
  unsigned char nPixel = 0;
  unsigned int nAddress = 0xC000 + ((nY / 8) * 80) + ((nY % 8) * 2048) + (nX / 2);
  nPixel = nX % 2;

  SetMode0PixelColor((unsigned char *)nAddress, nColor, nPixel);
}

////////////////////////////////////////////////////////////////////////
unsigned char char1,char2,char3,char4;

unsigned int GetTime()
{
  unsigned int nTime = 0;

  __asm
    CALL #0xBD0D ;KL TIME PLEASE
    PUSH HL
    POP DE
    LD HL, #_char3
    LD (HL), D
    LD HL, #_char4
    LD (HL), E
  __endasm;

  nTime = (char3 << 8) + char4;

  return nTime;
}
////////////////////////////////////////////////////////////////////////

struct _tStar
{
  unsigned char nX;
  unsigned char nY;
  unsigned char nStarType;
};

#define STARS_NUM 40
struct _tStar aStars[STARS_NUM];

void main()
{
  unsigned int nFPS = 0;
  unsigned int nTimeLast = 0;
  unsigned char nStar = 0;
  memset(aStars, 0, sizeof(aStars));
  
  //SCR_SET_MODE 0
  __asm
    ld a, #0
    call #0xBC0E
  __endasm;

  //PALETE
  __asm
    ld a, #0
    ld b, #0 ;black
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #1
    ld b, #12 ;Yellow
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #2
    ld b, #25 ;Pastel Yellow    
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #3
    ld b, #24 ;Bright Yellow
    ld c, b
    call #0xBC32 ;SCR SET INK
  __endasm;

  //SCR SET BORDER 0
  __asm
    ld b, #0 ;black
    ld c, b
    call #0xBC38
  __endasm;

  //Init
  for(nStar = 0; nStar < STARS_NUM; nStar++)
  {
    aStars[nStar].nX = rand() % 160;
    aStars[nStar].nY = rand() % 200;
    aStars[nStar].nStarType = rand() % 3;
  }

  nTimeLast = GetTime();

  while(1)
  {
    for(nStar = 0; nStar < STARS_NUM; nStar++)
    {
      //delete star
      PutPixelMode0(aStars[nStar].nX, aStars[nStar].nY, 0);

      //move star
      switch(aStars[nStar].nStarType)
      {
        case 0: //slow star
          aStars[nStar].nX += 1;
          break;
        case 1: //medium star
          aStars[nStar].nX += 2;
          break;
        case 2: //fast star
          aStars[nStar].nX += 3;
          break;
      }
      
      if(aStars[nStar].nX >= 160)
      {
        aStars[nStar].nX = 0;
        aStars[nStar].nY = rand() % 200;
        aStars[nStar].nStarType = rand() % 3;
        continue;
      }

      //paint star
      PutPixelMode0(aStars[nStar].nX, aStars[nStar].nY, aStars[nStar].nStarType + 1);
    }

    nFPS++;

    if(GetTime() - nTimeLast >= 300)
    {

      //TXT SET CURSOR 0,0
      __asm
        ld h, #1
        ld l, #1
        call #0xBB75
      __endasm;

      printf("%u  ", nFPS);

      nTimeLast = GetTime();
      nFPS = 0;
    }
  }
}
////////////////////////////////////////////////////////////////////////

If you compile and run on an emulator shows the following:

emulator

That is, the program is running at 29 frames per second. If we increase the stars to 80, we see that the speed drops to 15 frames per second and if we increase to 200 stars we see that the speed drops to 6 frames per second... To see what part of the code is consuming much time, we can for example try removing the paint, commenting on the two calls to PutPixelMode0 (the eraser and paint), leaving only the code that updates the position of the stars, we see that the speed rises to about 110 frames per second. If instead we leave the "erase / paint" of the pixels but commented the code that updates the position of the stars we see that the speed up to 35 frames per second. It is clear that the main problem is in the "erase / paint".

What can we do to optimize this code? At first glance are several things to optimize, for example, when we erase a star, we know that color is 0, but we are calling PutPixelMode0, which internally calls SetMode0PixelColor doing various bit-level operations to finally put value to 0, we could simply avoid putting all the byte to 0 directly. On the other hand, when we erase a star, calling PutPixelMode0 recalculates the position in video memory (with several additions, multiplications and divisions) when we had calculated this value before, when we paint the star. We could simply store the value when painting and so would not have to re-calculate to erase.

If we implement these optimizations in the code would be as follows:

////////////////////////////////////////////////////////////////////////
// star02.c
// Optimizing a 2D Star Field
// Mochilote - www.cpcmania.com
////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void SetMode0PixelColor(unsigned char *pByteAddress, unsigned char nColor, unsigned char nPixel)
{
  unsigned char nByte = *pByteAddress;

  if(nPixel == 0)
  {
    nByte &= 85;

    if(nColor & 1)
      nByte |= 128;

    if(nColor & 2)
      nByte |= 8;

    if(nColor & 4)
      nByte |= 32;

    if(nColor & 8)
      nByte |= 2;
  }
  else
  {
    nByte &= 170;

    if(nColor & 1)
      nByte |= 64;

    if(nColor & 2)
      nByte |= 4;

    if(nColor & 4)
      nByte |= 16;

    if(nColor & 8)
      nByte |= 1;
  }

  *pByteAddress = nByte;
}

unsigned char *PutPixelMode0(unsigned char nX, unsigned char nY, unsigned char nColor)
{
  unsigned char nPixel = 0;
  unsigned int nAddress = 0xC000 + ((nY / 8) * 80) + ((nY % 8) * 2048) + (nX / 2);
  nPixel = nX % 2;

  SetMode0PixelColor((unsigned char *)nAddress, nColor, nPixel);
  return (unsigned char *)nAddress;
}

////////////////////////////////////////////////////////////////////////
unsigned char char1,char2,char3,char4;

unsigned int GetTime()
{
  unsigned int nTime = 0;

  __asm
    CALL #0xBD0D ;KL TIME PLEASE
    PUSH HL
    POP DE
    LD HL, #_char3
    LD (HL), D
    LD HL, #_char4
    LD (HL), E
  __endasm;

  nTime = (char3 << 8) + char4;

  return nTime;
}
////////////////////////////////////////////////////////////////////////

struct _tStar
{
  unsigned char nX;
  unsigned char nY;
  unsigned char nStarType;
  unsigned char *pVideoAddress;
};

#define STARS_NUM 40
struct _tStar aStars[STARS_NUM];

void main()
{
  unsigned int nFPS = 0;
  unsigned int nTimeLast = 0;
  unsigned char nStar = 0;
  memset(aStars, 0, sizeof(aStars));
  
  //SCR_SET_MODE 0
  __asm
    ld a, #0
    call #0xBC0E
  __endasm;

  //PALETE
  __asm
    ld a, #0
    ld b, #0 ;black
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #1
    ld b, #12 ;Yellow
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #2
    ld b, #25 ;Pastel Yellow    
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #3
    ld b, #24 ;Bright Yellow
    ld c, b
    call #0xBC32 ;SCR SET INK
  __endasm;

  //SCR SET BORDER 0
  __asm
    ld b, #0 ;black
    ld c, b
    call #0xBC38
  __endasm;

  //Init
  for(nStar = 0; nStar < STARS_NUM; nStar++)
  {
    aStars[nStar].nX = rand() % 160;
    aStars[nStar].nY = rand() % 200;
    aStars[nStar].nStarType = rand() % 3;
    aStars[nStar].pVideoAddress = (unsigned char *)0xC000;
  }

  nTimeLast = GetTime();

  while(1)
  {
    for(nStar = 0; nStar < STARS_NUM; nStar++)
    {
      //delete star
      *aStars[nStar].pVideoAddress = 0;

      //move star
      switch(aStars[nStar].nStarType)
      {
        case 0: //slow star
          aStars[nStar].nX += 1;
          break;
        case 1: //medium star
          aStars[nStar].nX += 2;
          break;
        case 2: //fast star
          aStars[nStar].nX += 3;
          break;
      }
      
      if(aStars[nStar].nX >= 160)
      {
        aStars[nStar].nX = 0;
        aStars[nStar].nY = rand() % 200;
        aStars[nStar].nStarType = rand() % 3;
        continue;
      }

      //paint star
      aStars[nStar].pVideoAddress = PutPixelMode0(aStars[nStar].nX, aStars[nStar].nY, aStars[nStar].nStarType + 1);
    }

    nFPS++;

    if(GetTime() - nTimeLast >= 300)
    {
      //TXT SET CURSOR 0,0
      __asm
        ld h, #1
        ld l, #1
        call #0xBB75
      __endasm;

      printf("%u  ", nFPS);

      nTimeLast = GetTime();
      nFPS = 0;
    }
  }
}
////////////////////////////////////////////////////////////////////////

If we run in the emulator will see that the frame rate increases to 41, more than 33% improvement over the original code (which was 29)...

Is it possible to optimize the most? Much more, of course. For example, when a star reaches the right edge, it moves back to the position x = 0 and do a couple of rand () to randomly assign another speed and position on y axis, well, these calls to rand () are expensive, so if we remove, we see that increase the frames per second at 45, ie two random assignments cost us 4 frames per second...

Is it possible to optimize the most? Of course a lot. We have a couple of important things to optimize yet, the first is that every time we paint a pixel we call PutPixelMode0, which calculates the full address in video memory, but the stars only move horizontally, so its value on the y-axis is always constant and we are making several multiplications, divisions and amounts that are constant... The second optimization can be done is to change all access to the main loop to aStars[nStar] for a pointer to the concrete structure.

If we implement these optimizations in the code would be as follows:

////////////////////////////////////////////////////////////////////////
// star03.c
// Optimizing a 2D Star Field
// Mochilote - www.cpcmania.com
////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

unsigned char GetMode0PixelColorByte(unsigned char nColor, unsigned char nPixel)
{
  unsigned char nByte = 0;

  if(nPixel == 0)
  {
    nByte &= 85;

    if(nColor & 1)
      nByte |= 128;

    if(nColor & 2)
      nByte |= 8;

    if(nColor & 4)
      nByte |= 32;

    if(nColor & 8)
      nByte |= 2;
  }
  else
  {
    nByte &= 170;

    if(nColor & 1)
      nByte |= 64;

    if(nColor & 2)
      nByte |= 4;

    if(nColor & 4)
      nByte |= 16;

    if(nColor & 8)
      nByte |= 1;
  }

  return nByte;
}

unsigned char *GetLineAddress(unsigned char nLine)
{
  return (unsigned char *)0xC000 + ((nLine / 8) * 80) + ((nLine % 8) * 2048);
}

////////////////////////////////////////////////////////////////////////
unsigned char char1,char2,char3,char4;

unsigned int GetTime()
{
  unsigned int nTime = 0;

  __asm
    CALL #0xBD0D ;KL TIME PLEASE
    PUSH HL
    POP DE
    LD HL, #_char3
    LD (HL), D
    LD HL, #_char4
    LD (HL), E
  __endasm;

  nTime = (char3 << 8) + char4;

  return nTime;
}
////////////////////////////////////////////////////////////////////////

struct _tStar
{
  unsigned char nX;
  unsigned char nY;
  unsigned char nStarType;
  unsigned char *pLineAddress;
  unsigned char *pCurrentAddress;
};

#define STARS_NUM 40
struct _tStar aStars[STARS_NUM];

void main()
{
  unsigned int nFPS = 0;
  unsigned int nTimeLast = 0;
  unsigned char nStar = 0;
  struct _tStar *pStar = NULL;
  memset(aStars, 0, sizeof(aStars));
  
  //SCR_SET_MODE 0
  __asm
    ld a, #0
    call #0xBC0E
  __endasm;

  //PALETE
  __asm
    ld a, #0
    ld b, #0 ;black
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #1
    ld b, #12 ;Yellow
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #2
    ld b, #25 ;Pastel Yellow    
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #3
    ld b, #24 ;Bright Yellow
    ld c, b
    call #0xBC32 ;SCR SET INK
  __endasm;

  //SCR SET BORDER 0
  __asm
    ld b, #0 ;black
    ld c, b
    call #0xBC38
  __endasm;

  //Init
  for(nStar = 0; nStar < STARS_NUM; nStar++)
  {
    aStars[nStar].nX = rand() % 160;
    aStars[nStar].nY = rand() % 200;
    aStars[nStar].nStarType = rand() % 3;
    aStars[nStar].pLineAddress = GetLineAddress(aStars[nStar].nY);
    aStars[nStar].pCurrentAddress = aStars[nStar].pLineAddress;
  }

  nTimeLast = GetTime();

  while(1)
  {
    for(nStar = 0; nStar < STARS_NUM; nStar++)
    {
      pStar = &aStars[nStar];
      //delete star
      *pStar->pCurrentAddress = 0;

      //move star
      switch(pStar->nStarType)
      {
        case 0: //slow star
          pStar->nX += 1;
          break;
        case 1: //medium star
          pStar->nX += 2;
          break;
        case 2: //fast star
          pStar->nX += 3;
          break;
      }
      
      if(pStar->nX >= 160)
      {
        pStar->nX = 0;
        //pStar->nY = rand() % 200;
        //pStar->nStarType = rand() % 3;
        continue;
      }

      //paint star
      pStar->pCurrentAddress = pStar->pLineAddress + (pStar->nX / 2);
      *pStar->pCurrentAddress = GetMode0PixelColorByte(pStar->nStarType + 1, pStar->nX % 2);
    }

    nFPS++;

    if(GetTime() - nTimeLast >= 300)
    {
      //TXT SET CURSOR 0,0
      __asm
        ld h, #1
        ld l, #1
        call #0xBB75
      __endasm;

      printf("%u  ", nFPS);

      nTimeLast = GetTime();
      nFPS = 0;
    }
  }
}
////////////////////////////////////////////////////////////////////////

With these changes we reach a whopping 77 frames per second (above the capacity of painting of the Amstrad CPC), so we managed to double (and more) frames per second that we had when we started. Finally, if we clean code and delete printf and calls to measure times and we doubled the number of stars from 40 to 80, we get a star field effect with the double stars and more speed than when we started:

////////////////////////////////////////////////////////////////////////
// star04.c
// Optimizing a 2D Star Field
// Mochilote - www.cpcmania.com
////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

unsigned char GetMode0PixelColorByte(unsigned char nColor, unsigned char nPixel)
{
  unsigned char nByte = 0;

  if(nPixel == 0)
  {
    nByte &= 85;

    if(nColor & 1)
      nByte |= 128;

    if(nColor & 2)
      nByte |= 8;

    if(nColor & 4)
      nByte |= 32;

    if(nColor & 8)
      nByte |= 2;
  }
  else
  {
    nByte &= 170;

    if(nColor & 1)
      nByte |= 64;

    if(nColor & 2)
      nByte |= 4;

    if(nColor & 4)
      nByte |= 16;

    if(nColor & 8)
      nByte |= 1;
  }

  return nByte;
}

unsigned char *GetLineAddress(unsigned char nLine)
{
  return (unsigned char *)0xC000 + ((nLine / 8) * 80) + ((nLine % 8) * 2048);
}

struct _tStar
{
  unsigned char nX;
  unsigned char nY;
  unsigned char nStarType;
  unsigned char *pLineAddress;
  unsigned char *pCurrentAddress;
};

#define STARS_NUM 80
struct _tStar aStars[STARS_NUM];

void main()
{
  unsigned char nStar = 0;
  struct _tStar *pStar = NULL;
  memset(aStars, 0, sizeof(aStars));
  
  //SCR_SET_MODE 0
  __asm
    ld a, #0
    call #0xBC0E
  __endasm;

  //PALETE
  __asm
    ld a, #0
    ld b, #0 ;black
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #1
    ld b, #12 ;Yellow
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #2
    ld b, #25 ;Pastel Yellow    
    ld c, b
    call #0xBC32 ;SCR SET INK

    ld a, #3
    ld b, #24 ;Bright Yellow
    ld c, b
    call #0xBC32 ;SCR SET INK
  __endasm;

  //SCR SET BORDER 0
  __asm
    ld b, #0 ;black
    ld c, b
    call #0xBC38
  __endasm;

  //Init
  for(nStar = 0; nStar < STARS_NUM; nStar++)
  {
    aStars[nStar].nX = rand() % 160;
    aStars[nStar].nY = rand() % 200;
    aStars[nStar].nStarType = rand() % 3;
    aStars[nStar].pLineAddress = GetLineAddress(aStars[nStar].nY);
    aStars[nStar].pCurrentAddress = aStars[nStar].pLineAddress;
  }

  while(1)
  {
    for(nStar = 0; nStar < STARS_NUM; nStar++)
    {
      pStar = &aStars[nStar];
      //delete star
      *pStar->pCurrentAddress = 0;

      //move star
      switch(pStar->nStarType)
      {
        case 0: //slow star
          pStar->nX += 1;
          break;
        case 1: //medium star
          pStar->nX += 2;
          break;
        case 2: //fast star
          pStar->nX += 3;
          break;
      }
      
      if(pStar->nX >= 160)
      {
        pStar->nX = 0;
        //pStar->nY = rand() % 200;
        //pStar->nStarType = rand() % 3;
        continue;
      }

      //paint star
      pStar->pCurrentAddress = pStar->pLineAddress + (pStar->nX / 2);
      *pStar->pCurrentAddress = GetMode0PixelColorByte(pStar->nStarType + 1, pStar->nX % 2);
    }
  }
}
////////////////////////////////////////////////////////////////////////

If we run in the emulator will get something like this (much more fluid in the emulator):

stars

As we have seen, the optimization is achieved by avoiding unnecessary calculations, or having them pre-calculated. As someone said "the less time-consuming calculation is one that is not calculated" :-)

You could download a zip with all files (source code, bat to compile, binary and dsk's) here: Measuring_times_and_optimizing_2D_Starfield.zip

 

 

www.CPCMania.com 2012