docs + prefix checks

master
Jordan Orelli 10 years ago
parent 625bc97329
commit 460eeb1257

@ -5,3 +5,101 @@ ideas presented in [Peter Seymour's Efficient Lexicographic Encoding of
Numbers](elen.pdf). The paper's original source can be found at
[http://www.zanopha.com/docs/elen.pdf](http://www.zanopha.com/docs/elen.pdf),
but it is re-hosted [here](elen.pdf) for posterity.
Numbers are ordered. That's part of their whole thing, what with them being
numbers and all. 1 comes before 2, and 2 comes before 3, and so on, and so
forth. If you sort a list of integers like `[1, 9, 4, 10, 13, 31]` you'll get
them back in their normal ordering: `[1, 4, 9, 10, 13, 31]`. If each of those
were to be strings, such that the input is `['1', '9', '4', '10', '13', '31']`,
sorting them lexically would not produce a proper numerical ordering; you would
wind up with `['1', '10', '13', '31', '4', '9']`. That is quite annoying when
doing things like appending numbers to identical file names or basically
anything where you wish to put a number in the middle of what is otherwise a
string.
You could always just pad your numbers with zeroes, so the prior example would
have an input more like `['01', '09', '04', '10', '13', '31']`. This works
fine, but it requires that you know something about your input in advance.
Namely, it requires that you know the range of the input, which may not always
be the case. More subtly, it also requires that you know that you're only
dealing with positive integers; even with zero padding, negative numbers would
lexically sort backwards (e.g., `'+1'` lexically precedes `'+2'`, which makes
sense because 1 is less than 2, but it's also the case that `'-1'` precedes
`'-2'`, which makes no sense, since -2 is less than -1). Doubly subtle is the
fact that the only reason a negative number string precedes a positive number
string is that the `-` character precedes the `0` character in the ASCII
table. Triply subtle is that if you just always put a sign in front of your
numbers, the positive numbers will precede the negative numbers because the `+`
character precedes the `-` character in ASCII. The end result is that a list
like `['+1', '+4', '-9', '+10', '-13', '+31']` would lexically sort to `['+1',
'+10', '+31', '+4', '-13', '-9']`.
For a full description of how the problem is solved, read [the white
paper](elen.pdf).
## example
The following program counts from -20 to 20 and prints each number's lexnum
string:
```go
package main
import (
"fmt"
"github.com/jordanorelli/lexnum"
)
func main() {
// '=' is the prefix rune for positive numbers, '-' for negative numbers.
e := lexnum.NewEncoder('=', '-')
for i := -20; i <= 20; i++ {
// %-12s left-justifies the encoded string in a 12-character column,
// followed by the original integer.
fmt.Printf("%-12s%d\n", e.EncodeInt(i), i)
}
}
```
Running it would produce the following output:
```
--779 -20
--780 -19
--781 -18
--782 -17
--783 -16
--784 -15
--785 -14
--786 -13
--787 -12
--788 -11
--789 -10
-0 -9
-1 -8
-2 -7
-3 -6
-4 -5
-5 -4
-6 -3
-7 -2
-8 -1
0 0
=1 1
=2 2
=3 3
=4 4
=5 5
=6 6
=7 7
=8 8
=9 9
==210 10
==211 11
==212 12
==213 13
==214 14
==215 15
==216 16
==217 17
==218 18
==219 19
==220 20
```

@ -1,3 +1,13 @@
// Package lexnum provides an efficient lexicographic encoding of numbers as
// described here: http://www.zanopha.com/docs/elen.pdf
//
// lexnum allows a developer to encode integers as strings, such that the
// strings may be lexically sorted, preserving their numerical orderings.
//
// e.g., if one were to sort the numbers 0, 1, 2, 9, 10 lexically, we would
// wind up with 0, 1, 10, 2, 9. If we knew the maximum value, we could
// prescribe some zero-padding. Failing that, we would need a lexicographic
// encoding of the numbers. Lexnum attempts to provide this alternative.
package lexnum package lexnum
import ( import (
@ -5,18 +15,29 @@ import (
"strconv" "strconv"
) )
// An Encoder may be used to encode an integer as a string or to decode such a
// string back into an integer. The produced strings have the property that,
// for any set of numbers, sorting the strings lexically yields the same order
// as sorting the numbers numerically.
type Encoder struct { type Encoder struct {
pos rune pos rune
neg rune neg rune
} }
// NewEncoder creates a new lexnum Encoder. We achieve
func NewEncoder(pos rune, neg rune) *Encoder { func NewEncoder(pos rune, neg rune) *Encoder {
if pos < neg { if pos < neg {
panic("positive lexnum rune must be of higher rank than negative lexnum rune") panic("positive lexnum rune must be of higher rank than negative lexnum rune")
} }
if neg >= '0' {
panic("negative prefix must be lexically less than '0'")
}
if pos <= '9' {
panic("positive prefix must be lexically greather than '9'")
}
return &Encoder{pos: pos, neg: neg} return &Encoder{pos: pos, neg: neg}
} }
// EncodeInt encodes an integer as a lexnum string.
func (l Encoder) EncodeInt(i int) string { func (l Encoder) EncodeInt(i int) string {
if i == 0 { if i == 0 {
return "0" return "0"
@ -98,6 +119,7 @@ func (l Encoder) prefixCount(runes []rune) int {
return i return i
} }
// DecodeInt decodes a lexnum string, returning its original integer representation.
func (l Encoder) DecodeInt(s string) (int, error) { func (l Encoder) DecodeInt(s string) (int, error) {
if s == "" { if s == "" {
return 0, fmt.Errorf("illegal Lexnum decode of empty string") return 0, fmt.Errorf("illegal Lexnum decode of empty string")

@ -12,27 +12,27 @@ var lexNumTests = []struct {
out string out string
}{ }{
{0, "0"}, {0, "0"},
{1, "x1"}, {1, "=1"},
{9, "x9"}, {9, "=9"},
{10, "xx210"}, {10, "==210"},
{99, "xx299"}, {99, "==299"},
{100, "xx3100"}, {100, "==3100"},
{12345, "xx512345"}, {12345, "==512345"},
{123456789, "xx9123456789"}, {123456789, "==9123456789"},
{1234567890, "xxx2101234567890"}, {1234567890, "===2101234567890"},
{-1, "o8"}, {-1, "-8"},
{-2, "o7"}, {-2, "-7"},
{-9, "o0"}, {-9, "-0"},
{-10, "oo789"}, {-10, "--789"},
{-11, "oo788"}, {-11, "--788"},
{-123, "oo6876"}, {-123, "--6876"},
{-123456789, "oo0876543210"}, {-123456789, "--0876543210"},
{-1234567890, "ooo7898765432109"}, {-1234567890, "---7898765432109"},
} }
func TestLexnum(t *testing.T) { func TestLexnum(t *testing.T) {
rand.Seed(time.Now().UnixNano()) rand.Seed(time.Now().UnixNano())
e := NewEncoder('x', 'o') e := NewEncoder('=', '-')
for _, test := range lexNumTests { for _, test := range lexNumTests {
s := e.EncodeInt(test.in) s := e.EncodeInt(test.in)
t.Logf("%d -> %s", test.in, test.out) t.Logf("%d -> %s", test.in, test.out)
@ -49,6 +49,28 @@ func TestLexnum(t *testing.T) {
} }
} }
// -150 to 150 test
nums, stringz := make([]int, 0, 301), make([]string, 0, 301)
for x := -150; x <= 150; x += 1 {
nums = append(nums, x)
stringz = append(stringz, e.EncodeInt(x))
}
sort.Strings(stringz)
sort.Ints(nums)
for i := 0; i < len(nums); i++ {
n, err := e.DecodeInt(stringz[i])
if err != nil {
t.Errorf("unable to decode our own input: %v", stringz[i])
continue
}
if n != nums[i] {
t.Errorf("sorting is broken in range test")
t.Log(stringz, "\n", nums)
break
}
}
// random test
runsize := 8 runsize := 8
for runz := 0; runz < 4; runz += 1 { for runz := 0; runz < 4; runz += 1 {
nums, stringz := make([]int, runsize), make([]string, runsize) nums, stringz := make([]int, runsize), make([]string, runsize)

Loading…
Cancel
Save