From 460eeb125754d395da7c4dc1680a647dcb0e5bea Mon Sep 17 00:00:00 2001 From: Jordan Orelli Date: Tue, 16 Dec 2014 10:11:56 -0500 Subject: [PATCH] docs + prefix checks --- README.md | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++ lexnum.go | 22 ++++++++++++ lexnum_test.go | 56 ++++++++++++++++++++--------- 3 files changed, 159 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 01b3ce5..cbf484b 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,101 @@ ideas presented in [Peter Seymour's Efficient Lexicographic Encoding of Numbers](elen.pdf). The paper's original source can be found at [http://www.zanopha.com/docs/elen.pdf](http://www.zanopha.com/docs/elen.pdf), but it is re-hosted [here](elen.pdf) for posterity. + +Numbers are ordered. That's part of their whole thing, what with them being +numbers and all. 1 comes before 2, and 2 comes before 3, and so on, and so +forth. If you sort a list of integers like `[1, 9, 4, 10, 13, 31]` you'll get +them back in their normal ordering: `[1, 4, 9, 10, 13, 31]`. If each of those +were to be strings, such that the input is `['1', '9', '4', '10', '13', '31']`, +sorting them lexically would not produce a proper numerical sorting, you would +wind up with `['1', '10', '13', '31', '4', '9']`. That is quite annoying when +doing things like appending numbers to identical file names or basically +anything where you wish to put a number in the middle of what is otherwise a +string. + +You could always just pad your numbers with zeroes, so the prior example would +have an input more like `['01', '09', '04', '10', '13', '31']`. This works +fine, but it requires that you know something about your input in advance. +Namely, it requires that you know the range of the input, which may not always +be the case. 
More subtly, it also requires that you know that you're only +dealing with positive integers; even with zero padding, negative numbers would +lexically sort backwards (e.g., `'+1'` lexically precedes `'+2'`, which makes +sense because 1 is less than 2, but it's also the case that `'-1'` precedes +`'-2'`, which makes no sense, since -2 is less than -1). Doubly subtle is the +fact that the only reason a negative number string precedes a positive number +string is that the minus character precedes the zero character in the ASCII +table. Triply subtle is that if you just always put a sign in front of your +numbers, the positive numbers will precede the negative numbers because the `+` +character precedes the `-` character in ASCII. The end result is that a list +like `['+1', '+4', '-9', '+10', '-13', '+31']` would lexically sort to `['+1', +'+10', '+31', '+4', '-13', '-9']`. + +For a full description of how the problem is solved, read [the white +paper](elen.pdf). + +## example + +The following program would count from -20 to 20 and print each number's lexnum +string: + +```go +package main + +import ( + "fmt" + "github.com/jordanorelli/lexnum" +) + +func main() { + e := lexnum.NewEncoder('=', '-') + for i := -20; i <= 20; i++ { + fmt.Printf("%-12s%d\n", e.EncodeInt(i), i) + } +} +``` + +Running it would produce the following output: +``` +--779 -20 +--780 -19 +--781 -18 +--782 -17 +--783 -16 +--784 -15 +--785 -14 +--786 -13 +--787 -12 +--788 -11 +--789 -10 +-0 -9 +-1 -8 +-2 -7 +-3 -6 +-4 -5 +-5 -4 +-6 -3 +-7 -2 +-8 -1 +0 0 +=1 1 +=2 2 +=3 3 +=4 4 +=5 5 +=6 6 +=7 7 +=8 8 +=9 9 +==210 10 +==211 11 +==212 12 +==213 13 +==214 14 +==215 15 +==216 16 +==217 17 +==218 18 +==219 19 +==220 20 +``` + diff --git a/lexnum.go b/lexnum.go index 8df1d0f..514ae9a 100644 --- a/lexnum.go +++ b/lexnum.go @@ -1,3 +1,13 @@ +// Package lexnum provides an efficient lexicographic encoding of numbers as +// described here: http://www.zanopha.com/docs/elen.pdf +// +// lexnum allows a developer to 
encode integers as strings, such that the +// strings may be lexically sorted, preserving their numerical orderings. +// +// e.g., if we were to sort the numbers 0, 1, 2, 9, 10 lexically, we would +// wind up with 0, 1, 10, 2, 9. If we knew the maximum value, we could +// prescribe some zero-padding. Failing that, we would need a lexicographic +// encoding of the numbers. Lexnum attempts to provide this alternative. package lexnum import ( @@ -5,18 +15,29 @@ import ( "strconv" ) +// An Encoder may be used to encode or decode an integer as a string. The +// produced strings will have the property that any set of numbers will have +// the same lexical sorting and numeric sorting. type Encoder struct { pos rune neg rune } +// NewEncoder creates a new lexnum Encoder. It panics unless neg < '0' and pos > '9'. func NewEncoder(pos rune, neg rune) *Encoder { if pos < neg { panic("positive lexnum rune must be of higher rank than negative lexnum rune") } + if neg >= '0' { + panic("negative prefix must be lexically less than '0'") + } + if pos <= '9' { + panic("positive prefix must be lexically greater than '9'") + } return &Encoder{pos: pos, neg: neg} } +// EncodeInt encodes an integer as a string. func (l Encoder) EncodeInt(i int) string { if i == 0 { return "0" @@ -98,6 +119,7 @@ func (l Encoder) prefixCount(runes []rune) int { return i } +// DecodeInt decodes a lexnum string, returning its original integer representation. 
func (l Encoder) DecodeInt(s string) (int, error) { if s == "" { return 0, fmt.Errorf("illegal Lexnum decode of empty string") diff --git a/lexnum_test.go b/lexnum_test.go index 676fd6c..0f9889c 100644 --- a/lexnum_test.go +++ b/lexnum_test.go @@ -12,27 +12,27 @@ var lexNumTests = []struct { out string }{ {0, "0"}, - {1, "x1"}, - {9, "x9"}, - {10, "xx210"}, - {99, "xx299"}, - {100, "xx3100"}, - {12345, "xx512345"}, - {123456789, "xx9123456789"}, - {1234567890, "xxx2101234567890"}, - {-1, "o8"}, - {-2, "o7"}, - {-9, "o0"}, - {-10, "oo789"}, - {-11, "oo788"}, - {-123, "oo6876"}, - {-123456789, "oo0876543210"}, - {-1234567890, "ooo7898765432109"}, + {1, "=1"}, + {9, "=9"}, + {10, "==210"}, + {99, "==299"}, + {100, "==3100"}, + {12345, "==512345"}, + {123456789, "==9123456789"}, + {1234567890, "===2101234567890"}, + {-1, "-8"}, + {-2, "-7"}, + {-9, "-0"}, + {-10, "--789"}, + {-11, "--788"}, + {-123, "--6876"}, + {-123456789, "--0876543210"}, + {-1234567890, "---7898765432109"}, } func TestLexnum(t *testing.T) { rand.Seed(time.Now().UnixNano()) - e := NewEncoder('x', 'o') + e := NewEncoder('=', '-') for _, test := range lexNumTests { s := e.EncodeInt(test.in) t.Logf("%d -> %s", test.in, test.out) @@ -49,6 +49,28 @@ func TestLexnum(t *testing.T) { } } + // -150 to 150 test + nums, stringz := make([]int, 0, 301), make([]string, 0, 301) + for x := -150; x <= 150; x += 1 { + nums = append(nums, x) + stringz = append(stringz, e.EncodeInt(x)) + } + sort.Strings(stringz) + sort.Ints(nums) + for i := 0; i < len(nums); i++ { + n, err := e.DecodeInt(stringz[i]) + if err != nil { + t.Errorf("unable to decode our own input: %v", stringz[i]) + continue + } + if n != nums[i] { + t.Errorf("sorting is broken in range test") + t.Log(stringz, "\n", nums) + break + } + } + + // random test runsize := 8 for runz := 0; runz < 4; runz += 1 { nums, stringz := make([]int, runsize), make([]string, runsize)