Groups | Search | Server Info | Keyboard shortcuts | Login | Register [http] [https] [nntp] [nntps]
Groups > comp.lang.forth > #7455 > unrolled thread
| Started by | "jacereda@gmail.com" <jacereda@gmail.com> |
|---|---|
| First post | 2011-11-24 14:29 -0800 |
| Last post | 2011-12-05 09:19 -0800 |
| Articles | 18 — 7 participants |
Back to article view | Back to comp.lang.forth
What would you call this threading method? "jacereda@gmail.com" <jacereda@gmail.com> - 2011-11-24 14:29 -0800
Re: What would you call this threading method? Alex McDonald <blog@rivadpm.com> - 2011-11-24 14:47 -0800
Re: What would you call this threading method? "jacereda@gmail.com" <jacereda@gmail.com> - 2011-11-27 12:50 -0800
Re: What would you call this threading method? "Rod Pemberton" <do_not_have@noavailemail.cmm> - 2011-11-27 17:47 -0500
Re: What would you call this threading method? Alex McDonald <blog@rivadpm.com> - 2011-11-27 15:27 -0800
Re: What would you call this threading method? mhx@iae.nl (Marcel Hendrix) - 2011-11-26 18:49 +0200
Re: What would you call this threading method? "jacereda@gmail.com" <jacereda@gmail.com> - 2011-11-27 12:46 -0800
Re: What would you call this threading method? Charles Childers <crc@rx-core.org> - 2011-11-29 17:05 -0800
Re: What would you call this threading method? Charles Childers <crc@rx-core.org> - 2011-11-29 17:06 -0800
Re: What would you call this threading method? Charles Childers <crc@rx-core.org> - 2011-11-29 17:05 -0800
Re: What would you call this threading method? Charles Childers <crc@rx-core.org> - 2011-11-29 17:06 -0800
Re: What would you call this threading method? mhx@iae.nl (Marcel Hendrix) - 2011-11-30 23:49 +0200
Re: What would you call this threading method? crc <charles.childers@gmail.com> - 2011-12-02 06:22 -0800
Re: What would you call this threading method? mhx@iae.nl (Marcel Hendrix) - 2011-12-03 11:52 +0200
Re: What would you call this threading method? crc <charles.childers@gmail.com> - 2011-12-07 10:51 -0800
Re: What would you call this threading method? mhx@iae.nl (Marcel Hendrix) - 2011-11-28 00:35 +0200
Re: What would you call this threading method? "jacereda@gmail.com" <jacereda@gmail.com> - 2011-11-27 16:33 -0800
Re: What would you call this threading method? Mat <dambere@web.de> - 2011-12-05 09:19 -0800
| From | "jacereda@gmail.com" <jacereda@gmail.com> |
|---|---|
| Date | 2011-11-24 14:29 -0800 |
| Subject | What would you call this threading method? |
| Message-ID | <d12ad4dd-361c-4eb7-8762-2e7fcc9ded92@b32g2000yqn.googlegroups.com> |
Hi,
I was looking at the default Ngaro C virtual machine (Retro) and
noticed the generated code to the switch-based dispatcher is quite
bad. Thinking about ways to speed it up, I came up with the code
below.
Since the interpreter is contained in a single routine, the compiler
can make a good job at allocating registers. The speed difference is
notable (bench/fib.rx with 1000 iterations instead of 100):
bash-3.2$ time ./retro64
real 0m8.158s
user 0m8.052s
sys 0m0.043s
bash-3.2$ time ./retroht64
real 0m1.547s
user 0m1.524s
sys 0m0.014s
I removed some checks, but the speedup was also very noticeable before
that.
The trick is making NEXT jump directly to an address (relative to the
NOP primitive) contained in a memory area ('shadow') that mirrors the
main dictionary. Initially, that area is all filled with the address
of the RESOLVE primitive. The RESOLVE primitive will then replace the
entry that resulted in a call to itself with the address corresponding
to the real opcode (sort of a poor man's JIT).
So, any idea on what would this threading method be?
Thanks,
Jorge
/* Ngaro VM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Copyright (c) 2008 - 2011, Charles Childers
Copyright (c) 2009 - 2010, Luke Parrish
Copyright (c) 2010, Marc Simpson
Copyright (c) 2010, Jay Skeer
Copyright (c) 2011, Kenneth Keating
Copyright (c) 2011, Jorge Acereda
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <stdarg.h>
#include <string.h>
#include <termios.h>
#include <sys/ioctl.h>
/* Configuration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------+---------+---------+
| 16 bit | 32 bit | 64 bit |
+------------+---------+---------+---------+
| IMAGE_SIZE | 32000 | 1000000 | 1000000 |
+------------+---------+---------+---------+
| CELL | int16_t | int32_t | int64_t |
+------------+---------+---------+---------+
If memory is tight, cut the MAX_FILE_NAME and MAX_REQUEST_LENGTH.
You can also cut the ADDRESSES stack size down, but if you have
heavy nesting or recursion this may cause problems. If you do
modify
it and experience odd problems, try raising it a bit higher.
Use -DRX16 to select defaults for 16-bit, or -DRX64 to select the
defaults for 64-bit. Without these, the compiler will generate a
standard 32-bit VM.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
#define CELL int32_t
#define IMAGE_SIZE 1000000
#define ADDRESSES 1024
#define STACK_DEPTH 128
#define PORTS 16
#define MAX_FILE_NAME 1024
#define MAX_REQUEST_LENGTH 1024
#define MAX_OPEN_FILES 8
#define LOCAL "retroImage"
#ifdef RX64
#undef CELL
#undef LOCAL
#define CELL int64_t
#define LOCAL "retroImage64"
#endif
#ifdef RX16
#undef CELL
#undef LOCAL
#undef IMAGE_SIZE
#define CELL int16_t
#define IMAGE_SIZE 32000
#define LOCAL "retroImage16"
#endif
enum vm_opcode {VM_NOP, VM_LIT, VM_DUP, VM_DROP,
VM_SWAP, VM_PUSH, VM_POP, VM_LOOP,
VM_JUMP, VM_RETURN, VM_GT_JUMP, VM_LT_JUMP,
VM_NE_JUMP,VM_EQ_JUMP, VM_FETCH, VM_STORE,
VM_ADD, VM_SUB, VM_MUL, VM_DIVMOD,
VM_AND, VM_OR, VM_XOR, VM_SHL,
VM_SHR, VM_ZERO_EXIT, VM_INC, VM_DEC,
VM_IN, VM_OUT, VM_WAIT,
// These don't appear in the image
VM_RESOLVE, VM_COLON, VM_BYE };
typedef struct {
CELL address[ADDRESSES];
CELL data[STACK_DEPTH];
CELL image[IMAGE_SIZE];
CELL ports[PORTS];
FILE *files[MAX_OPEN_FILES];
FILE *input[MAX_OPEN_FILES];
char filename[MAX_FILE_NAME];
char request[MAX_REQUEST_LENGTH];
struct termios new_termios, old_termios;
CELL sp, rsp, ip, tos;
CELL isp;
CELL shrink;
uint16_t shadow[IMAGE_SIZE+1];
} VM;
static inline void dbg(const char * fmt, ...) {
if (0) {
va_list ap;
va_start(ap, fmt);
vprintf(fmt, ap);
printf("\n");
va_end(ap);
}
}
#define TOS vm->tos
#define DROP TOS = vm->data[vm->sp--]
/* Helper Functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
static void rxGetString(VM *vm, int starting)
{
CELL i = 0;
while(vm->image[starting] && i < MAX_REQUEST_LENGTH)
vm->request[i++] = (char)vm->image[starting++];
vm->request[i] = 0;
}
/* Console I/O Support
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
static void rxWriteConsole(VM *vm, CELL c) {
(c > 0) ? putchar((char)c) : printf("\033[2J\033[1;1H");
/* Erase the previous character if c = backspace */
if (c == 8) {
putchar(32);
putchar(8);
}
}
static CELL rxReadConsole(VM *vm) {
CELL c;
if ((c = getc(vm->input[vm->isp])) == EOF && vm->input[vm->isp] !=
stdin) {
fclose(vm->input[vm->isp--]);
c = 0;
}
if (c == EOF && vm->input[vm->isp] == stdin)
exit(0);
return c;
}
static void rxIncludeFile(VM *vm, char *s) {
FILE *file;
if ((file = fopen(s, "r")))
vm->input[++vm->isp] = file;
}
static void rxPrepareInput(VM *vm) {
vm->isp = 0;
vm->input[vm->isp] = stdin;
}
static void rxPrepareOutput(VM *vm) {
tcgetattr(0, &vm->old_termios);
vm->new_termios = vm->old_termios;
vm->new_termios.c_iflag &= ~(BRKINT+ISTRIP+IXON+IXOFF);
vm->new_termios.c_iflag |= (IGNBRK+IGNPAR);
vm->new_termios.c_lflag &= ~(ICANON+ISIG+IEXTEN+ECHO);
vm->new_termios.c_cc[VMIN] = 1;
vm->new_termios.c_cc[VTIME] = 0;
tcsetattr(0, TCSANOW, &vm->new_termios);
}
static void rxRestoreIO(VM *vm) {
tcsetattr(0, TCSANOW, &vm->old_termios);
}
/* File I/O Support
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
static CELL rxGetFileHandle(VM *vm)
{
CELL i;
for(i = 1; i < MAX_OPEN_FILES; i++)
if (vm->files[i] == 0)
return i;
return 0;
}
static void rxAddInputSource(VM *vm) {
CELL name = TOS; DROP;
rxGetString(vm, name);
rxIncludeFile(vm, vm->request);
}
static CELL rxOpenFile(VM *vm) {
CELL slot, mode, name;
slot = rxGetFileHandle(vm);
mode = TOS; DROP;
name = TOS; DROP;
rxGetString(vm, name);
if (slot > 0)
{
if (mode == 0) vm->files[slot] = fopen(vm->request, "r");
if (mode == 1) vm->files[slot] = fopen(vm->request, "w");
if (mode == 2) vm->files[slot] = fopen(vm->request, "a");
if (mode == 3) vm->files[slot] = fopen(vm->request, "r+");
}
if (vm->files[slot] == NULL)
{
vm->files[slot] = 0;
slot = 0;
}
return slot;
}
static CELL rxReadFile(VM *vm) {
CELL c;
c = fgetc(vm->files[TOS]);
DROP;
return (c == EOF) ? 0 : c;
}
static CELL rxWriteFile(VM *vm) {
CELL slot, c, r;
slot = TOS; DROP;
c = TOS; DROP;
r = fputc(c, vm->files[slot]);
return (r == EOF) ? 0 : 1;
}
static CELL rxCloseFile(VM *vm) {
fclose(vm->files[TOS]);
vm->files[TOS] = 0;
DROP;
return 0;
}
static CELL rxGetFilePosition(VM *vm) {
CELL slot = TOS; DROP;
CELL r;
r = (CELL) ftell(vm->files[slot]);
return r;
}
static CELL rxSetFilePosition(VM *vm) {
CELL slot, pos, r;
slot = TOS; DROP;
pos = TOS; DROP;
r = fseek(vm->files[slot], pos, SEEK_SET);
return r;
}
static CELL rxGetFileSize(VM *vm) {
CELL slot, current, r, size;
slot = TOS; DROP;
current = ftell(vm->files[slot]);
r = fseek(vm->files[slot], 0, SEEK_END);
size = ftell(vm->files[slot]);
fseek(vm->files[slot], current, SEEK_SET);
return (r == 0) ? size : 0;
}
static CELL rxDeleteFile(VM *vm) {
CELL r;
CELL name = TOS; DROP;
rxGetString(vm, name);
r = (unlink(vm->request) == 0) ? -1 : 0;
return r;
}
static CELL rxLoadImage(VM *vm, const char *image) {
FILE *fp;
CELL x = 0;
if ((fp = fopen(image, "rb")) != NULL) {
x = fread(vm->image, sizeof(CELL), IMAGE_SIZE, fp);
fclose(fp);
}
return x;
}
static CELL rxSaveImage(VM *vm, char *image) {
FILE *fp;
CELL x = 0;
if ((fp = fopen(image, "wb")) == NULL)
{
printf("Unable to save the retroImage!\n");
rxRestoreIO(vm);
exit(2);
}
x = fwrite(&vm->image, sizeof(CELL),
vm->shrink? vm->image[3] : IMAGE_SIZE, fp);
fclose(fp);
return x;
}
/* Environment Query
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
static void rxQueryEnvironment(VM *vm) {
CELL req, dest;
char *r;
req = TOS; DROP;
dest = TOS; DROP;
rxGetString(vm, req);
r = getenv(vm->request);
if (r != 0)
while (*r != '\0')
{
vm->image[dest] = *r;
dest++;
r++;
}
else
vm->image[dest] = 0;
}
/* Device I/O Handler
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
static void rxDeviceHandler(VM *vm) {
struct winsize w;
if (vm->ports[0] != 1) {
/* Input */
if (vm->ports[0] == 0 && vm->ports[1] == 1) {
vm->ports[1] = rxReadConsole(vm);
vm->ports[0] = 1;
}
/* Output (character generator) */
if (vm->ports[2] == 1) {
rxWriteConsole(vm, TOS); DROP;
vm->ports[2] = 0;
vm->ports[0] = 1;
}
/* File IO and Image Saving */
if (vm->ports[4] != 0) {
vm->ports[0] = 1;
switch (vm->ports[4]) {
case 1: rxSaveImage(vm, vm->filename);
vm->ports[4] = 0;
break;
case 2: rxAddInputSource(vm);
vm->ports[4] = 0;
break;
case -1: vm->ports[4] = rxOpenFile(vm);
break;
case -2: vm->ports[4] = rxReadFile(vm);
break;
case -3: vm->ports[4] = rxWriteFile(vm);
break;
case -4: vm->ports[4] = rxCloseFile(vm);
break;
case -5: vm->ports[4] = rxGetFilePosition(vm);
break;
case -6: vm->ports[4] = rxSetFilePosition(vm);
break;
case -7: vm->ports[4] = rxGetFileSize(vm);
break;
case -8: vm->ports[4] = rxDeleteFile(vm);
break;
default: vm->ports[4] = 0;
}
}
/* Capabilities */
if (vm->ports[5] != 0) {
vm->ports[0] = 1;
switch(vm->ports[5]) {
case -1: vm->ports[5] = IMAGE_SIZE;
break;
case -2: vm->ports[5] = 0;
break;
case -3: vm->ports[5] = 0;
break;
case -4: vm->ports[5] = 0;
break;
case -5: vm->ports[5] = vm->sp;
break;
case -6: vm->ports[5] = vm->rsp;
break;
case -7: vm->ports[5] = 0;
break;
case -8: vm->ports[5] = time(NULL);
break;
case -9: vm->ports[5] = 0;
vm->ip = IMAGE_SIZE;
break;
case -10: vm->ports[5] = 0;
rxQueryEnvironment(vm);
break;
case -11: ioctl(0, TIOCGWINSZ, &w);
vm->ports[5] = w.ws_col;
break;
case -12: ioctl(0, TIOCGWINSZ, &w);
vm->ports[5] = w.ws_row;
break;
default: vm->ports[5] = 0;
}
}
}
}
#undef TOS
#undef DROP
#define IP ip
#define SP sp
#define RSP rsp
#define TOS tos
#define DROP TOS = vm->data[SP--]
#define DUP do { SP++; NOS=TOS; } while (0)
#define NOS vm->data[SP]
#define TORS vm->address[RSP]
#define SKIPNOPS \
do if (vm->image[IP]==0) { \
IP++; \
if (vm->image[IP] == 0) \
IP++; \
} while (0)
/* The VM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
intptr_t rxTick(VM * vm) {
register CELL a;
register CELL b;
register CELL sp;
register CELL rsp;
register CELL ip;
register CELL tos;
const char * const opcodes[] = {
&&label_NOP, &&label_LIT, &&label_DUP, &&label_DROP,
&&label_SWAP, &&label_PUSH, &&label_POP, &&label_LOOP,
&&label_JUMP, &&label_RETURN, &&label_GT_JUMP, &&label_LT_JUMP,
&&label_NE_JUMP,&&label_EQ_JUMP, &&label_FETCH, &&label_STORE,
&&label_ADD, &&label_SUB, &&label_MUL, &&label_DIVMOD,
&&label_AND, &&label_OR, &&label_XOR, &&label_SHL,
&&label_SHR, &&label_ZERO_EXIT, &&label_INC, &&label_DEC,
&&label_IN, &&label_OUT, &&label_WAIT,
&&label_RESOLVE, &&label_COLON, &&label_BYE};
if (vm == (void*)0)
return opcodes[VM_RESOLVE] - opcodes[VM_NOP];
if (vm == (void*)1)
return opcodes[VM_BYE] - opcodes[VM_NOP];
#define PRIM(x) label_##x: asm("# " #x "\n")
#define NEXT goto *(opcodes[VM_NOP] + vm->shadow[IP++])
#define SAVEREGS vm->sp = sp; vm->rsp = rsp; vm->ip = ip; vm->tos =
tos
#define RESTREGS sp = vm->sp; rsp = vm->rsp; ip = vm->ip; tos = vm-
>tos
RESTREGS;
for (;;) {
NEXT;
PRIM(NOP);
NEXT;
PRIM(LIT);
DUP;
TOS = vm->image[IP++];
NEXT;
PRIM(DUP);
DUP;
NEXT;
PRIM(DROP);
DROP;
NEXT;
PRIM(SWAP);
a = TOS;
TOS = NOS;
NOS = a;
NEXT;
PRIM(PUSH);
RSP++;
TORS = TOS;
DROP;
NEXT;
PRIM(POP);
DUP;
TOS = TORS;
RSP--;
NEXT;
PRIM(LOOP);
if (--TOS > 0)
IP = vm->image[IP];
else {
IP++;
DROP;
}
NEXT;
PRIM(JUMP);
IP = vm->image[IP];
SKIPNOPS;
NEXT;
PRIM(RETURN);
IP = TORS;
RSP--;
IP++;
SKIPNOPS;
NEXT;
PRIM(GT_JUMP);
if(NOS > TOS)
IP = vm->image[IP];
else
IP++;
DROP; DROP;
NEXT;
PRIM(LT_JUMP);
if(NOS < TOS)
IP = vm->image[IP];
else
IP++;
DROP; DROP;
NEXT;
PRIM(NE_JUMP);
if(TOS != NOS)
IP = vm->image[IP];
else
IP++;
DROP; DROP;
NEXT;
PRIM(EQ_JUMP);
if(TOS == NOS)
IP = vm->image[IP];
else
IP++;
DROP; DROP;
NEXT;
PRIM(FETCH);
TOS = vm->image[TOS];
NEXT;
PRIM(STORE);
vm->image[TOS] = NOS;
DROP; DROP;
NEXT;
PRIM(ADD);
NOS += TOS;
DROP;
NEXT;
PRIM(SUB);
NOS -= TOS;
DROP;
NEXT;
PRIM(MUL);
NOS *= TOS;
DROP;
NEXT;
PRIM(DIVMOD);
a = TOS;
b = NOS;
TOS = b / a;
NOS = b % a;
NEXT;
PRIM(AND);
a = TOS;
b = NOS;
DROP;
TOS = a & b;
NEXT;
PRIM(OR);
a = TOS;
b = NOS;
DROP;
TOS = a | b;
NEXT;
PRIM(XOR);
a = TOS;
b = NOS;
DROP;
TOS = a ^ b;
NEXT;
PRIM(SHL);
a = TOS;
b = NOS;
DROP;
TOS = b << a;
NEXT;
PRIM(SHR);
a = TOS;
DROP;
TOS >>= a;
NEXT;
PRIM(ZERO_EXIT);
if (TOS == 0) {
DROP;
IP = TORS + 1;
RSP--;
}
NEXT;
PRIM(INC);
TOS += 1;
NEXT;
PRIM(DEC);
TOS -= 1;
NEXT;
PRIM(IN);
a = TOS;
TOS = vm->ports[a];
vm->ports[a] = 0;
NEXT;
PRIM(OUT);
vm->ports[0] = 0;
vm->ports[TOS] = NOS;
DROP; DROP;
NEXT;
PRIM(WAIT);
SAVEREGS;
rxDeviceHandler(vm);
RESTREGS;
NEXT;
PRIM(COLON);
RSP++;
a = IP-1;
TORS = a;
IP = vm->image[a];
SKIPNOPS;
NEXT;
PRIM(RESOLVE);
IP--;
a = vm->image[IP];
vm->shadow[IP] = opcodes[a < VM_RESOLVE? a : VM_COLON] -
opcodes[VM_NOP];
NEXT;
PRIM(BYE);
SAVEREGS;
return 0;
NEXT;
}
}
VM * rxNew() {
return calloc(1, sizeof(VM));
}
void rxDel(VM * vm) {
free(vm);
}
int rxInit(VM * vm, const char * img, int shrink) {
CELL offset;
unsigned i;
offset = rxTick((VM*)0);
for (i = 0; i < IMAGE_SIZE; i++)
vm->shadow[i] = offset;
vm->shadow[IMAGE_SIZE] = rxTick((VM*)1);
vm->ip = 1;
vm->shrink = shrink;
strcpy(vm->filename, img);
return rxLoadImage(vm, img);
}
void rxTerm(VM * vm) {
#ifdef NOTYET
unsigned i;
for (i = 0; i < MAX_OPEN_FILES; i++) {
if (vm->files[i])
fclose(vm->files[i]);
if (vm->input[i])
fclose(vm->input[i]);
}
#endif
}
/* Main
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
int main(int argc, char **argv) {
unsigned i;
VM *vm;
const char * img = LOCAL;
int shrink = 0;
for (i = 1; i < argc; i++) {
if (!strcmp(argv[i], "--image"))
img = argv[++i];
if (!strcmp(argv[i], "--shrink"))
shrink = 1;
}
vm = rxNew();
if (rxInit(vm, img, shrink)) {
rxPrepareInput(vm);
rxPrepareOutput(vm);
for (i = 1; i < argc; i++) {
if (!strcmp(argv[i], "--with"))
rxIncludeFile(vm, argv[++i]);
}
rxTick(vm);
}
rxTerm(vm);
rxDel(vm);
rxRestoreIO(vm);
return 0;
}
[toc] | [next] | [standalone]
| From | Alex McDonald <blog@rivadpm.com> |
|---|---|
| Date | 2011-11-24 14:47 -0800 |
| Message-ID | <e2d31c77-9d47-46b0-b839-b2ea47431656@u6g2000vbg.googlegroups.com> |
| In reply to | #7455 |
On Nov 24, 10:29 pm, "jacer...@gmail.com" <jacer...@gmail.com> wrote:
> Hi,
>
> I was looking at the default Ngaro C virtual machine (Retro) and
> noticed the generated code to the switch-based dispatcher is quite
> bad. Thinking about ways to speed it up, I came up with the code
> below.
>
> Since the interpreter is contained in a single routine, the compiler
> can make a good job at allocating registers. The speed difference is
> notable (bench/fib.rx with 1000 iterations instead of 100):
>
> bash-3.2$ time ./retro64
>
> real 0m8.158s
> user 0m8.052s
> sys 0m0.043s
> bash-3.2$ time ./retroht64
>
> real 0m1.547s
> user 0m1.524s
> sys 0m0.014s
>
> I removed some checks, but the speedup was also very noticeable before
> that.
>
> The trick is making NEXT jump directly to an address (relative to the
> NOP primitive) contained in a memory area ('shadow') that mirrors the
> main dictionary. Initially, that area is all filled with the address
> of the RESOLVE primitive. The RESOLVE primitive will then replace the
> entry that resulted in a call to itself with the address corresponding
> to the real opcode (sort of a poor man's JIT).
>
> So, any idea on what would this threading method be?
Yes; it's dynamic loading as used by OS designers when dealing with
libraries. Win32Forth uses this mechanism to call external "words",
that is, Windows entry points in DLLs. The same effect can be achieved
with DEFER.
> Thanks,
> Jorge
>
[snip]
[toc] | [prev] | [next] | [standalone]
| From | "jacereda@gmail.com" <jacereda@gmail.com> |
|---|---|
| Date | 2011-11-27 12:50 -0800 |
| Message-ID | <0db58761-c05e-4f92-b509-5b31292e3d00@4g2000yqu.googlegroups.com> |
| In reply to | #7456 |
On Nov 24, 11:47 pm, Alex McDonald <b...@rivadpm.com> wrote:
> On Nov 24, 10:29 pm, "jacer...@gmail.com" <jacer...@gmail.com> wrote:
> > The trick is making NEXT jump directly to an address (relative to the
> > NOP primitive) contained in a memory area ('shadow') that mirrors the
> > main dictionary. Initially, that area is all filled with the address
> > of the RESOLVE primitive. The RESOLVE primitive will then replace the
> > entry that resulted in a call to itself with the address corresponding
> > to the real opcode (sort of a poor man's JIT).
>
> > So, any idea on what would this threading method be?
>
> Yes; it's dynamic loading as used by OS designers when dealing with
> libraries. Win32Forth uses this mechanism to call external "words",
> that is, Windows entry points in DLLs. The same effect can be achieved
> with DEFER.
Hmm... The VM is unconventional in the sense that cells containing
opcodes <32 are primitives and anything above that is a call to a
colon definition.
I would assume this to be token-threading, but after adding this
'dynamic resolving' or 'dynamic loading' or whatever it's called, it
behaves more like a direct-threaded interpreter.
[toc] | [prev] | [next] | [standalone]
| From | "Rod Pemberton" <do_not_have@noavailemail.cmm> |
|---|---|
| Date | 2011-11-27 17:47 -0500 |
| Message-ID | <jauej9$2na$1@speranza.aioe.org> |
| In reply to | #7543 |
<jacereda@gmail.com> wrote in message news:0db58761-c05e-4f92-b509-5b31292e3d00@4g2000yqu.googlegroups.com... > > The VM is unconventional in the sense that cells containing > opcodes <32 are primitives and anything above that is a call > to a colon definition. I did something similar a while back for a non-Forth, but Forth-like interpreter. Feel free to use this idea if you want. Non-zero values were absolute addresses for compiled words. Most Forth words are high-level, or compiled. Therefore, most Forth words have a CFA (code field address) that points to NEXT or DOCOL. So, what is the point of having a CFA? Only a small handful of words, i.e., low-level or primitive words, need a CFA. Instead of having a code field with an address as to what routine is to be executed for each word, I eliminated the code field. By default, all words are compiled words, and therefore interpreted. However, low-level, or primitive, words must be executed. How do you detect them? I used zero (0) to act as a trap value that halted the address interpreter and indicated the next word is a primitive or low-level word. You can't call NULL anyway. It's an invalid address for most systems. Once halted, the value at the location immediately after the zero was retrieved. The post zero value indicated which primitive to execute. I.e., high level words are absolute addresses, and low-level words are a pair: zero and value for the primitive. This means I only need a single check for non-zero values in the address interpreter's loop and a small switch to execute the primitives. > I would assume this to be token-threading, but after adding this > 'dynamic resolving' or 'dynamic loading' or whatever it's called, > it behaves more like a direct-threaded interpreter. TTC (token threaded code) uses values which indicate what "instruction" they are to perform, but are not absolute addresses. Bytecode is the main example of this. ITC (indirect threaded code) and DTC (direct threaded code) both use lists of addresses. I'd say both yours and mine are a hybrid. Your opcodes are like TTC, while the higher addresses are either DTC or ITC depending on what you did. Brad Rodriquez's Moving Forth has a decent intro to DTC, ITC, TTC, and STC: http://www.bradrodriguez.com/papers/moving1.htm Rod Pemberton
[toc] | [prev] | [next] | [standalone]
| From | Alex McDonald <blog@rivadpm.com> |
|---|---|
| Date | 2011-11-27 15:27 -0800 |
| Message-ID | <90dd0226-675f-4fe5-bade-6e8401c9c34c@q30g2000yqj.googlegroups.com> |
| In reply to | #7543 |
On Nov 27, 8:50 pm, "jacer...@gmail.com" <jacer...@gmail.com> wrote:
> On Nov 24, 11:47 pm, Alex McDonald <b...@rivadpm.com> wrote:
>
> > On Nov 24, 10:29 pm, "jacer...@gmail.com" <jacer...@gmail.com> wrote:
> > > The trick is making NEXT jump directly to an address (relative to the
> > > NOP primitive) contained in a memory area ('shadow') that mirrors the
> > > main dictionary. Initially, that area is all filled with the address
> > > of the RESOLVE primitive. The RESOLVE primitive will then replace the
> > > entry that resulted in a call to itself with the address corresponding
> > > to the real opcode (sort of a poor man's JIT).
>
> > > So, any idea on what would this threading method be?
>
> > Yes; it's dynamic loading as used by OS designers when dealing with
> > libraries. Win32Forth uses this mechanism to call external "words",
> > that is, Windows entry points in DLLs. The same effect can be achieved
> > with DEFER.
>
> Hmm... The VM is unconventional in the sense that cells containing
> opcodes <32 are primitives and anything above that is a call to a
> colon definition.
> I would assume this to be token-threading, but after adding this
> 'dynamic resolving' or 'dynamic loading' or whatever it's called, it
> behaves more like a direct-threaded interpreter.
There are extensions of this technique; see
http://www.complang.tuwien.ac.at/papers/ertl&pirker97.ps.gz "The
Structure of a Forth Native Code Compiler, [ertl&pirker98] M. Anton
Ertl and Christian Pirker" which discusses compile on EXECUTE.
[toc] | [prev] | [next] | [standalone]
| From | mhx@iae.nl (Marcel Hendrix) |
|---|---|
| Date | 2011-11-26 18:49 +0200 |
| Message-ID | <19861709928436@frunobulax.edu> |
| In reply to | #7455 |
"jacereda@gmail.com" <jacereda@gmail.com> writes Re: What would you call this threading method? [..] > Since the interpreter is contained in a single routine, the compiler > can make a good job at allocating registers. The speed difference is > notable (bench/fib.rx with 1000 iterations instead of 100): [..] When compiled with cygwin (32-bit) and run under Windows 7, it works (I downloaded the default image file from the main site, guessing that would be somehow necessary). However, I always get a segfault when I do bye? -marcel
[toc] | [prev] | [next] | [standalone]
| From | "jacereda@gmail.com" <jacereda@gmail.com> |
|---|---|
| Date | 2011-11-27 12:46 -0800 |
| Message-ID | <3ae77a59-84c3-4cd3-9c58-8c62ec5c8c0a@q30g2000yqj.googlegroups.com> |
| In reply to | #7499 |
On Nov 26, 5:49 pm, m...@iae.nl (Marcel Hendrix) wrote: > "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? > > [..]> Since the interpreter is contained in a single routine, the compiler > > can make a good job at allocating registers. The speed difference is > > notable (bench/fib.rx with 1000 iterations instead of 100): > > [..] > > When compiled with cygwin (32-bit) and run under Windows 7, it works > (I downloaded the default image file from the main site, guessing that > would be somehow necessary). However, I always get a segfault when > I do bye? Could it have something to do with rxRestoreIO()? What if you comment rxPrepareOutput() and rxRestoreIO()? I just cross-compiled a mingw version leaving those out and it runs fine.
[toc] | [prev] | [next] | [standalone]
| From | Charles Childers <crc@rx-core.org> |
|---|---|
| Date | 2011-11-29 17:05 -0800 |
| Message-ID | <d81000ed-c796-4412-ac51-4ae67b22b6bc@m7g2000vbc.googlegroups.com> |
| In reply to | #7499 |
On Nov 26, 11:49 am, m...@iae.nl (Marcel Hendrix) wrote: > "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? > > [..]> Since the interpreter is contained in a single routine, the compiler > > can make a good job at allocating registers. The speed difference is > > notable (bench/fib.rx with 1000 iterations instead of 100): > > [..] > > When compiled with cygwin (32-bit) and run under Windows 7, it works > (I downloaded the default image file from the main site, guessing that > would be somehow necessary). However, I always get a segfault when > I do bye? > > -marcel At the end of the main function, switch the positions of the calls to rxDel and rxRestoreIO. This will fix the bug. -- crc
[toc] | [prev] | [next] | [standalone]
| From | Charles Childers <crc@rx-core.org> |
|---|---|
| Date | 2011-11-29 17:06 -0800 |
| Message-ID | <deb623b8-6364-4db0-adf5-2aad0d9071ef@x7g2000yqb.googlegroups.com> |
| In reply to | #7499 |
On Nov 26, 11:49 am, m...@iae.nl (Marcel Hendrix) wrote: > "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? > > [..]> Since the interpreter is contained in a single routine, the compiler > > can make a good job at allocating registers. The speed difference is > > notable (bench/fib.rx with 1000 iterations instead of 100): > > [..] > > When compiled with cygwin (32-bit) and run under Windows 7, it works > (I downloaded the default image file from the main site, guessing that > would be somehow necessary). However, I always get a segfault when > I do bye? > > -marcel At the end of the main function, switch the positions of the calls to rxDel and rxRestoreIO. This will fix the bug. -- crc
[toc] | [prev] | [next] | [standalone]
| From | Charles Childers <crc@rx-core.org> |
|---|---|
| Date | 2011-11-29 17:05 -0800 |
| Message-ID | <288c25fc-4fcf-4eb1-b214-d9e00b9ea8bc@l24g2000yqm.googlegroups.com> |
| In reply to | #7499 |
On Nov 26, 11:49 am, m...@iae.nl (Marcel Hendrix) wrote: > "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? > > [..]> Since the interpreter is contained in a single routine, the compiler > > can make a good job at allocating registers. The speed difference is > > notable (bench/fib.rx with 1000 iterations instead of 100): > > [..] > > When compiled with cygwin (32-bit) and run under Windows 7, it works > (I downloaded the default image file from the main site, guessing that > would be somehow necessary). However, I always get a segfault when > I do bye? > > -marcel At the end of the main function, switch the positions of the calls to rxDel and rxRestoreIO. This will fix the bug. -- crc
[toc] | [prev] | [next] | [standalone]
| From | Charles Childers <crc@rx-core.org> |
|---|---|
| Date | 2011-11-29 17:06 -0800 |
| Message-ID | <541f26b7-e884-4890-9e10-60889a8d1815@q16g2000yqn.googlegroups.com> |
| In reply to | #7499 |
On Nov 26, 11:49 am, m...@iae.nl (Marcel Hendrix) wrote: > "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? > > [..]> Since the interpreter is contained in a single routine, the compiler > > can make a good job at allocating registers. The speed difference is > > notable (bench/fib.rx with 1000 iterations instead of 100): > > [..] > > When compiled with cygwin (32-bit) and run under Windows 7, it works > (I downloaded the default image file from the main site, guessing that > would be somehow necessary). However, I always get a segfault when > I do bye? > > -marcel At the end of the main function, switch the positions of the calls to rxDel and rxRestoreIO. This will fix the bug. -- crc
[toc] | [prev] | [next] | [standalone]
| From | mhx@iae.nl (Marcel Hendrix) |
|---|---|
| Date | 2011-11-30 23:49 +0200 |
| Message-ID | <18861205928436@frunobulax.edu> |
| In reply to | #7626 |
Charles Childers <crc@rx-core.org> writes Re: What would you call this threading method? > On Nov 26, 11:49=A0am, m...@iae.nl (Marcel Hendrix) wrote: >> "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? >> [..]> Since the interpreter is contained in a single routine, the compiler >> > can make a good job at allocating registers. The speed difference is >> > notable (bench/fib.rx with 1000 iterations instead of 100): [..] >> When compiled with cygwin (32-bit) and run under Windows 7, it works >> (I downloaded the default image file from the main site, guessing that >> would be somehow necessary). However, I always get a segfault when >> I do bye? >> -marcel > At the end of the main function, switch the positions of the calls to > rxDel and rxRestoreIO. This will fix the bug. OK. Now I have your attention ... Somewhat to my surprise I found that cygwin 32bit has no problem in building a runnable 64-bit ngaro interpreter! Of course, in this case the host is still 32-bits. Is this the intended behavior? (the mingw 64bit tools that I have can only build windows targets and do not seem to understand the assembler syntax in ngaro.c). I found that in past few days it has become quite a lot more difficult to find convert.c and source files for ngaro. -marcel
[toc] | [prev] | [next] | [standalone]
| From | crc <charles.childers@gmail.com> |
|---|---|
| Date | 2011-12-02 06:22 -0800 |
| Message-ID | <3a7a57e7-72a6-4e2e-a108-24a25f1c808e@o5g2000yqa.googlegroups.com> |
| In reply to | #7656 |
On Nov 30, 4:49 pm, m...@iae.nl (Marcel Hendrix) wrote: > Charles Childers <c...@rx-core.org> writes Re: What would you call this threading method? > > > On Nov 26, 11:49=A0am, m...@iae.nl (Marcel Hendrix) wrote: > >> "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? > >> [..]> Since the interpreter is contained in a single routine, the compiler > >> > can make a good job at allocating registers. The speed difference is > >> > notable (bench/fib.rx with 1000 iterations instead of 100): > > [..] > > >> When compiled with cygwin (32-bit) and run under Windows 7, it works > >> (I downloaded the default image file from the main site, guessing that > >> would be somehow necessary). However, I always get a segfault when > >> I do bye? > >> -marcel > > At the end of the main function, switch the positions of the calls to > > rxDel and rxRestoreIO. This will fix the bug. > > OK. > > Now I have your attention ... > > Somewhat to my surprise I found that cygwin 32bit has no problem in building a > runnable 64-bit ngaro interpreter! Of course, in this case the host is still > 32-bits. Is this the intended behavior? (the mingw 64bit tools that I have can > only build windows targets and do not seem to understand the assembler syntax > in ngaro.c). There's nothing in the source to prevent building and using a 64-bit image and ngaro on 32-bit hardware. It'll be slower, but works. (Likewise, with the 16-bit option) Regarding the assembler syntax, I've not used inline assembly in C in many years, so can't help there. I have built it with a modified PRIM macro: #define PRIM(x) label_##x: But only on 32-bit systems and compilers. There's a performance loss with this, though it's still faster than my implementation. > > I found that in past few days it has become quite a lot more difficult to find > convert.c and source files for ngaro. How so? -- crc
[toc] | [prev] | [next] | [standalone]
| From | mhx@iae.nl (Marcel Hendrix) |
|---|---|
| Date | 2011-12-03 11:52 +0200 |
| Message-ID | <58839200918436@frunobulax.edu> |
| In reply to | #7626 |
crc <charles.childers@gmail.com> writes Re: What would you call this threading method? > On Nov 30, 4:49 pm, m...@iae.nl (Marcel Hendrix) wrote: >> Charles Childers <c...@rx-core.org> writes Re: What would you call this threading method? [..] >> Somewhat to my surprise I found that cygwin 32bit has no problem in building a >> runnable 64-bit ngaro interpreter! Of course, in this case the host is still >> 32-bits. Is this the intended behavior? (the mingw 64bit tools that I have can >> only build windows targets and do not seem to understand the assembler syntax >> in ngaro.c). > There's nothing in the source to prevent building and using a 64-bit > image and ngaro on 32-bit hardware. It'll be slower, but works. > (Likewise, with the 16-bit option) OK, great. > Regarding the assembler syntax, I've not used inline assembly in C in > many years, so can't help there. I have built it with a modified PRIM > macro: > #define PRIM(x) label_##x: Sorry about that, I did not realize I was working with a hacked copy of the source. I have now downloaded the original ngaro.c, but it is using the termios stuff that mingw does not understand. When removing that it works in 32-bit mode but still no luck in 64-bit mode. (I guess there is a problem with int etc.) [..] >> I found that in past few days it has become quite a lot more difficult to find >> convert.c and source files for ngaro. > How so? Try looking for 'retro' using Google :-) The best entry seems to be through the retro blog. It proved far easier to download the gForth vm for retro and port it to iForth64. Even without the messy literal ` immediate stuff it runs fine. I added the missing getEnv and cw ch underpinning and 'fixed' the output of ^J and the input of '\' on Windows. The result for the 'fib' benchmark puzzles me. -marcel ------------------------------------------------------------------------------------ [1] iForth x64 server 1.03 (console), Nov 4 2011, 21:17:27. [2] Stuffed iForth at $00FF4440 [entry: $01000000] [3] Having a Windows terminal. [4] Console is active. [5] Sound devices are internal. [6] Executing include/iforth.prf Creating --- LOCATE utility Version 1.10 --- Creating --- Several utilities Version 3.12 --- Creating --- Extended OS words Version 3.17 --- Creating --- Terminal Driver Version 3.14 --- Creating --- Command line Editor Version 1.30 --- Creating --- Online help Version 1.36 --- Creating --- Glossary Generator Version 1.05 --- Creating --- Disassembler Version 2.40 --- iForth version 4.0.753, generated 13:01:18, October 8, 2011. x86_64 binary, native floating-point, extended precision. Copyright 1996 - 2011 Marcel Hendrix. FORTH> in ngaro ok ngaro> retro Retro 11.1 (1321716294) ok -1 ok 1 ok >> ok putn 9223372036854775807 ok "HOME" ok getEnv ok puts C:/Users/marcel ok bye ok ngaro> Benchmark results on Intel i7 920, 2.67 GHz CPU, 64bit code, iForth VM: benchmark Best reported iForth64 -------------------------------------- factorial 0.46 s C# 0.46 s fib 0.25 s C 1.10 s loops 2.18 s Java 1.99 s
[toc] | [prev] | [next] | [standalone]
| From | crc <charles.childers@gmail.com> |
|---|---|
| Date | 2011-12-07 10:51 -0800 |
| Message-ID | <abd10249-6362-45df-a248-79552dc5243e@c13g2000vbh.googlegroups.com> |
| In reply to | #7708 |
On Dec 3, 4:52 am, m...@iae.nl (Marcel Hendrix) wrote: > crc <charles.child...@gmail.com> writes Re: What would you call this threading method? > > > > > On Nov 30, 4:49 pm, m...@iae.nl (Marcel Hendrix) wrote: > >> Charles Childers <c...@rx-core.org> writes Re: What would you call this threading method? > [..] > >> Somewhat to my surprise I found that cygwin 32bit has no problem in building a > >> runnable 64-bit ngaro interpreter! Of course, in this case the host is still > >> 32-bits. Is this the intended behavior? (the mingw 64bit tools that I have can > >> only build windows targets and do not seem to understand the assembler syntax > >> in ngaro.c). > > There's nothing in the source to prevent building and using a 64-bit > > image and ngaro on 32-bit hardware. It'll be slower, but works. > > (Likewise, with the 16-bit option) > > OK, great. > > > Regarding the assembler syntax, I've not used inline assembly in C in > > many years, so can't help there. I have built it with a modified PRIM > > macro: > > #define PRIM(x) label_##x: > > Sorry about that, I did not realize I was working with a hacked copy of > the source. > > I have now downloaded the original ngaro.c, but it is using the termios stuff > that mingw does not understand. When removing that it works in 32-bit mode > but still no luck in 64-bit mode. (I guess there is a problem with int etc.) > > [..] > > >> I found that in past few days it has become quite a lot more difficult to find > >> convert.c and source files for ngaro. > > How so? > > Try looking for 'retro' using Google :-) > The best entry seems to be through the retro blog. A search for 'retro forth' or 'retro language' should work better. I admit that the name makes it annoying to search for... > It proved far easier to download the gForth vm for retro and port it to iForth64. > Even without the messy literal ` immediate stuff it runs fine. I added > the missing getEnv and cw ch underpinning and 'fixed' the output of ^J and > the input of '\' on Windows. Glad to hear. > The result for the 'fib' benchmark puzzles me. This should now be fixed in the repository; thanks! > -marcel > > --------------------------------------------------------------------------- --------- > > [1] iForth x64 server 1.03 (console), Nov 4 2011, 21:17:27. > [2] Stuffed iForth at $00FF4440 [entry: $01000000] > [3] Having a Windows terminal. > [4] Console is active. > [5] Sound devices are internal. > [6] Executing include/iforth.prf > > Creating --- LOCATE utility Version 1.10 --- > Creating --- Several utilities Version 3.12 --- > Creating --- Extended OS words Version 3.17 --- > Creating --- Terminal Driver Version 3.14 --- > Creating --- Command line Editor Version 1.30 --- > Creating --- Online help Version 1.36 --- > Creating --- Glossary Generator Version 1.05 --- > Creating --- Disassembler Version 2.40 --- > > iForth version 4.0.753, generated 13:01:18, October 8, 2011. > x86_64 binary, native floating-point, extended precision. > Copyright 1996 - 2011 Marcel Hendrix. > > FORTH> in ngaro ok > ngaro> retro Retro 11.1 (1321716294) > > ok -1 > ok 1 > ok >> > ok putn 9223372036854775807 > ok "HOME" > ok getEnv > ok puts C:/Users/marcel > ok bye > ok > ngaro> > > Benchmark results on Intel i7 920, 2.67 GHz CPU, 64bit code, iForth VM: > > benchmark Best reported iForth64 > -------------------------------------- > factorial 0.46 s C# 0.46 s > fib 0.25 s C 1.10 s > loops 2.18 s Java 1.99 s -- crc
[toc] | [prev] | [next] | [standalone]
| From | mhx@iae.nl (Marcel Hendrix) |
|---|---|
| Date | 2011-11-28 00:35 +0200 |
| Message-ID | <87680307928436@frunobulax.edu> |
| In reply to | #7455 |
"jacereda@gmail.com" <jacereda@gmail.com> writes Re: What would you call this threading method? > On Nov 26, 5:49=A0pm, m...@iae.nl (Marcel Hendrix) wrote: >> "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? [..] > Could it have something to do with rxRestoreIO()? What if you comment > rxPrepareOutput() and rxRestoreIO()? > I just cross-compiled a mingw version leaving those out and it runs > fine. That fixes it for me too. A 64-bit version is not that easy, though. -marcel
[toc] | [prev] | [next] | [standalone]
| From | "jacereda@gmail.com" <jacereda@gmail.com> |
|---|---|
| Date | 2011-11-27 16:33 -0800 |
| Message-ID | <01ac81d7-7b71-4660-968e-7691f0cb54f3@o5g2000yqa.googlegroups.com> |
| In reply to | #7554 |
On Nov 27, 11:35 pm, m...@iae.nl (Marcel Hendrix) wrote: > "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? > > > On Nov 26, 5:49=A0pm, m...@iae.nl (Marcel Hendrix) wrote: > >> "jacer...@gmail.com" <jacer...@gmail.com> writes Re: What would you call this threading method? > > [..] > > > Could it have something to do with rxRestoreIO()? What if you comment > > rxPrepareOutput() and rxRestoreIO()? > > I just cross-compiled a mingw version leaving those out and it runs > > fine. > > That fixes it for me too. A 64-bit version is not that easy, though. > > -marcel Just define RX64 and use the convert.c utility from the retro repository to turn the image into a 64 bit image.
[toc] | [prev] | [next] | [standalone]
| From | Mat <dambere@web.de> |
|---|---|
| Date | 2011-12-05 09:19 -0800 |
| Message-ID | <1045c56c-e924-4ce3-9fe3-77b113bd64b5@v29g2000yqv.googlegroups.com> |
| In reply to | #7455 |
On 24 Nov., 23:29, "jacer...@gmail.com" <jacer...@gmail.com> wrote:
> Hi,
>
> I was looking at the default Ngaro C virtual machine (Retro) and
> noticed the generated code to the switch-based dispatcher is quite
> bad. Thinking about ways to speed it up, I came up with the code
> below.
>
> Since the interpreter is contained in a single routine, the compiler
> can make a good job at allocating registers. The speed difference is
> notable (bench/fib.rx with 1000 iterations instead of 100):
>
> bash-3.2$ time ./retro64
>
> real 0m8.158s
> user 0m8.052s
> sys 0m0.043s
> bash-3.2$ time ./retroht64
>
> real 0m1.547s
> user 0m1.524s
> sys 0m0.014s
>
> I removed some checks, but the speedup was also very noticeable before
> that.
>
> The trick is making NEXT jump directly to an address (relative to the
> NOP primitive) contained in a memory area ('shadow') that mirrors the
> main dictionary. Initially, that area is all filled with the address
> of the RESOLVE primitive. The RESOLVE primitive will then replace the
> entry that resulted in a call to itself with the address corresponding
> to the real opcode (sort of a poor man's JIT).
>
> So, any idea on what would this threading method be?
> Thanks,
> Jorge
>
> /* Ngaro VM
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> Copyright (c) 2008 - 2011, Charles Childers
> Copyright (c) 2009 - 2010, Luke Parrish
> Copyright (c) 2010, Marc Simpson
> Copyright (c) 2010, Jay Skeer
> Copyright (c) 2011, Kenneth Keating
> Copyright (c) 2011, Jorge Acereda
>
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> */
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <time.h>
> #include <unistd.h>
> #include <stdarg.h>
> #include <string.h>
> #include <termios.h>
> #include <sys/ioctl.h>
>
> /* Configuration
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> +---------+---------+---------+
> | 16 bit | 32 bit | 64 bit |
> +------------+---------+---------+---------+
> | IMAGE_SIZE | 32000 | 1000000 | 1000000 |
> +------------+---------+---------+---------+
> | CELL | int16_t | int32_t | int64_t |
> +------------+---------+---------+---------+
>
> If memory is tight, cut the MAX_FILE_NAME and MAX_REQUEST_LENGTH.
>
> You can also cut the ADDRESSES stack size down, but if you have
> heavy nesting or recursion this may cause problems. If you do
> modify
> it and experience odd problems, try raising it a bit higher.
>
> Use -DRX16 to select defaults for 16-bit, or -DRX64 to select the
> defaults for 64-bit. Without these, the compiler will generate a
> standard 32-bit VM.
>
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> */
> #define CELL int32_t
> #define IMAGE_SIZE 1000000
> #define ADDRESSES 1024
> #define STACK_DEPTH 128
> #define PORTS 16
> #define MAX_FILE_NAME 1024
> #define MAX_REQUEST_LENGTH 1024
> #define MAX_OPEN_FILES 8
> #define LOCAL "retroImage"
>
> #ifdef RX64
> #undef CELL
> #undef LOCAL
> #define CELL int64_t
> #define LOCAL "retroImage64"
> #endif
>
> #ifdef RX16
> #undef CELL
> #undef LOCAL
> #undef IMAGE_SIZE
> #define CELL int16_t
> #define IMAGE_SIZE 32000
> #define LOCAL "retroImage16"
> #endif
>
> enum vm_opcode {VM_NOP, VM_LIT, VM_DUP, VM_DROP,
> VM_SWAP, VM_PUSH, VM_POP, VM_LOOP,
> VM_JUMP, VM_RETURN, VM_GT_JUMP, VM_LT_JUMP,
> VM_NE_JUMP,VM_EQ_JUMP, VM_FETCH, VM_STORE,
> VM_ADD, VM_SUB, VM_MUL, VM_DIVMOD,
> VM_AND, VM_OR, VM_XOR, VM_SHL,
> VM_SHR, VM_ZERO_EXIT, VM_INC, VM_DEC,
> VM_IN, VM_OUT, VM_WAIT,
> // These don't appear in the image
> VM_RESOLVE, VM_COLON, VM_BYE };
>
> typedef struct {
> CELL address[ADDRESSES];
> CELL data[STACK_DEPTH];
> CELL image[IMAGE_SIZE];
> CELL ports[PORTS];
> FILE *files[MAX_OPEN_FILES];
> FILE *input[MAX_OPEN_FILES];
> char filename[MAX_FILE_NAME];
> char request[MAX_REQUEST_LENGTH];
> struct termios new_termios, old_termios;
> CELL sp, rsp, ip, tos;
> CELL isp;
> CELL shrink;
> uint16_t shadow[IMAGE_SIZE+1];
>
> } VM;
>
> static inline void dbg(const char * fmt, ...) {
> if (0) {
> va_list ap;
> va_start(ap, fmt);
> vprintf(fmt, ap);
> printf("\n");
> va_end(ap);
> }
>
> }
>
> #define TOS vm->tos
> #define DROP TOS = vm->data[vm->sp--]
>
> /* Helper Functions
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
> static void rxGetString(VM *vm, int starting)
> {
> CELL i = 0;
> while(vm->image[starting] && i < MAX_REQUEST_LENGTH)
> vm->request[i++] = (char)vm->image[starting++];
> vm->request[i] = 0;
>
> }
>
> /* Console I/O Support
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
> static void rxWriteConsole(VM *vm, CELL c) {
> (c > 0) ? putchar((char)c) : printf("\033[2J\033[1;1H");
> /* Erase the previous character if c = backspace */
> if (c == 8) {
> putchar(32);
> putchar(8);
> }
>
> }
>
> static CELL rxReadConsole(VM *vm) {
> CELL c;
> if ((c = getc(vm->input[vm->isp])) == EOF && vm->input[vm->isp] !=
> stdin) {
> fclose(vm->input[vm->isp--]);
> c = 0;
> }
> if (c == EOF && vm->input[vm->isp] == stdin)
> exit(0);
> return c;
>
> }
>
> static void rxIncludeFile(VM *vm, char *s) {
> FILE *file;
> if ((file = fopen(s, "r")))
> vm->input[++vm->isp] = file;
>
> }
>
> static void rxPrepareInput(VM *vm) {
> vm->isp = 0;
> vm->input[vm->isp] = stdin;
>
> }
>
> static void rxPrepareOutput(VM *vm) {
> tcgetattr(0, &vm->old_termios);
> vm->new_termios = vm->old_termios;
> vm->new_termios.c_iflag &= ~(BRKINT+ISTRIP+IXON+IXOFF);
> vm->new_termios.c_iflag |= (IGNBRK+IGNPAR);
> vm->new_termios.c_lflag &= ~(ICANON+ISIG+IEXTEN+ECHO);
> vm->new_termios.c_cc[VMIN] = 1;
> vm->new_termios.c_cc[VTIME] = 0;
> tcsetattr(0, TCSANOW, &vm->new_termios);
>
> }
>
> static void rxRestoreIO(VM *vm) {
> tcsetattr(0, TCSANOW, &vm->old_termios);
>
> }
>
> /* File I/O Support
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
> static CELL rxGetFileHandle(VM *vm)
> {
> CELL i;
> for(i = 1; i < MAX_OPEN_FILES; i++)
> if (vm->files[i] == 0)
> return i;
> return 0;
>
> }
>
> static void rxAddInputSource(VM *vm) {
> CELL name = TOS; DROP;
> rxGetString(vm, name);
> rxIncludeFile(vm, vm->request);
>
> }
>
> static CELL rxOpenFile(VM *vm) {
> CELL slot, mode, name;
> slot = rxGetFileHandle(vm);
> mode = TOS; DROP;
> name = TOS; DROP;
> rxGetString(vm, name);
> if (slot > 0)
> {
> if (mode == 0) vm->files[slot] = fopen(vm->request, "r");
> if (mode == 1) vm->files[slot] = fopen(vm->request, "w");
> if (mode == 2) vm->files[slot] = fopen(vm->request, "a");
> if (mode == 3) vm->files[slot] = fopen(vm->request, "r+");
> }
> if (vm->files[slot] == NULL)
> {
> vm->files[slot] = 0;
> slot = 0;
> }
> return slot;
>
> }
>
> static CELL rxReadFile(VM *vm) {
> CELL c;
> c = fgetc(vm->files[TOS]);
> DROP;
> return (c == EOF) ? 0 : c;
>
> }
>
> static CELL rxWriteFile(VM *vm) {
> CELL slot, c, r;
> slot = TOS; DROP;
> c = TOS; DROP;
> r = fputc(c, vm->files[slot]);
> return (r == EOF) ? 0 : 1;
>
> }
>
> static CELL rxCloseFile(VM *vm) {
> fclose(vm->files[TOS]);
> vm->files[TOS] = 0;
> DROP;
> return 0;
>
> }
>
> static CELL rxGetFilePosition(VM *vm) {
> CELL slot = TOS; DROP;
> CELL r;
> r = (CELL) ftell(vm->files[slot]);
> return r;
>
> }
>
> static CELL rxSetFilePosition(VM *vm) {
> CELL slot, pos, r;
> slot = TOS; DROP;
> pos = TOS; DROP;
> r = fseek(vm->files[slot], pos, SEEK_SET);
> return r;
>
> }
>
> static CELL rxGetFileSize(VM *vm) {
> CELL slot, current, r, size;
> slot = TOS; DROP;
> current = ftell(vm->files[slot]);
> r = fseek(vm->files[slot], 0, SEEK_END);
> size = ftell(vm->files[slot]);
> fseek(vm->files[slot], current, SEEK_SET);
> return (r == 0) ? size : 0;
>
> }
>
> static CELL rxDeleteFile(VM *vm) {
> CELL r;
> CELL name = TOS; DROP;
> rxGetString(vm, name);
> r = (unlink(vm->request) == 0) ? -1 : 0;
> return r;
>
> }
>
> static CELL rxLoadImage(VM *vm, const char *image) {
> FILE *fp;
> CELL x = 0;
> if ((fp = fopen(image, "rb")) != NULL) {
> x = fread(vm->image, sizeof(CELL), IMAGE_SIZE, fp);
> fclose(fp);
> }
> return x;
>
> }
>
> static CELL rxSaveImage(VM *vm, char *image) {
> FILE *fp;
> CELL x = 0;
> if ((fp = fopen(image, "wb")) == NULL)
> {
> printf("Unable to save the retroImage!\n");
> rxRestoreIO(vm);
> exit(2);
> }
> x = fwrite(&vm->image, sizeof(CELL),
> vm->shrink? vm->image[3] : IMAGE_SIZE, fp);
> fclose(fp);
> return x;
>
> }
>
> /* Environment Query
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
> static void rxQueryEnvironment(VM *vm) {
> CELL req, dest;
> char *r;
> req = TOS; DROP;
> dest = TOS; DROP;
> rxGetString(vm, req);
> r = getenv(vm->request);
> if (r != 0)
> while (*r != '\0')
> {
> vm->image[dest] = *r;
> dest++;
> r++;
> }
> else
> vm->image[dest] = 0;
>
> }
>
> /* Device I/O Handler
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
> static void rxDeviceHandler(VM *vm) {
> struct winsize w;
> if (vm->ports[0] != 1) {
> /* Input */
> if (vm->ports[0] == 0 && vm->ports[1] == 1) {
> vm->ports[1] = rxReadConsole(vm);
> vm->ports[0] = 1;
> }
>
> /* Output (character generator) */
> if (vm->ports[2] == 1) {
> rxWriteConsole(vm, TOS); DROP;
> vm->ports[2] = 0;
> vm->ports[0] = 1;
> }
>
> /* File IO and Image Saving */
> if (vm->ports[4] != 0) {
> vm->ports[0] = 1;
> switch (vm->ports[4]) {
> case 1: rxSaveImage(vm, vm->filename);
> vm->ports[4] = 0;
> break;
> case 2: rxAddInputSource(vm);
> vm->ports[4] = 0;
> break;
> case -1: vm->ports[4] = rxOpenFile(vm);
> break;
> case -2: vm->ports[4] = rxReadFile(vm);
> break;
> case -3: vm->ports[4] = rxWriteFile(vm);
> break;
> case -4: vm->ports[4] = rxCloseFile(vm);
> break;
> case...
>
> Erfahren Sie mehr »
The idea is somewhat similar to context threading, without using
subroutine branches for BTB synchronisation of course.
[toc] | [prev] | [standalone]
Back to top | Article view | comp.lang.forth
csiph-web