Source code to 'How tux21b fixed my regex' article

00 meisterluk's version

echo "meisterluk, overall + critical path, time\n"
go run 00.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
Go
§
package main

import (
	"bytes"
	"fmt"
	"os"
	"regexp"
	"strconv"
	"strings"
	"time"
)

// Generate a regex accepting any string but the string of concatenated numbers
// lower_bound to upper_bound (each inclusive).
func generate_regex(lower_bound, upper_bound, mode int) string {
	var not_str bytes.Buffer
	var buf bytes.Buffer

	// generate not_string
	for number := lower_bound; number <= upper_bound; number++ {
		not_str.WriteString(strconv.Itoa(number))
	}
	not_string := not_str.String()
	l := len(not_string)

	// mode 3 = lookahead
	if mode == 3 {
		buf.WriteString("^(?!")
		buf.WriteString(not_string)
		buf.WriteString("$).*$")
		return buf.String()
	}

	buf.WriteString("^(")

	// strings with length smaller than not_string
	// mode 1 == |.|..|...|…
	if mode == 1 {
		for i := 0; i < l; i++ {
			buf.WriteString(strings.Repeat(".", i))
			if i != l-1 {
				buf.WriteString("|")
			}
		}
		// mode 2 == .?.?.?.?…
	} else if mode == 2 {
		buf.WriteString(strings.Repeat(".?", l-1))
	}

	buf.WriteString("|")

	// strings with length equal to not_string
	for index := 0; index < l; index++ {
		buf.WriteString(strings.Repeat(".", index))
		buf.WriteString("[^")
		buf.WriteString(string(not_string[index]))
		buf.WriteString("]")
		buf.WriteString(strings.Repeat(".", l-index-1))
		if index != l-1 {
			buf.WriteString("|")
		}
	}

	buf.WriteString("|")

	// strings with length greater than not_string

	buf.WriteString(strings.Repeat(".", l))
	buf.WriteString(".+")

	buf.WriteString(")$")

	return buf.String()
}

// Generate less than `(upper_bound - lower_bound)^2` input strings.
// Returns testsuite dict.
func generate_input_strings(lower_bound, upper_bound int) map[string]bool {
	testsuite := make(map[string]bool)

	// generate not_string
	var not_str bytes.Buffer
	for number := lower_bound; number <= upper_bound; number++ {
		not_str.WriteString(strconv.Itoa(number))
	}
	not_string := not_str.String()

	// generate strings
	for begin := lower_bound; begin <= upper_bound; begin++ {
		for end := begin + 1; end <= upper_bound; end++ {
			var input_string = not_string[begin:end]
			matches := (input_string != not_string)
			testsuite[input_string] = matches
		}
	}

	return testsuite
}

// A helper for measuring time
func trackingTime(begin time.Time) {
	elapsed := time.Since(begin)
	fmt.Printf("It takes %s to test all input strings.", elapsed)
}

// Actually run all pattern searches.
func run_test(pat *regexp.Regexp, testsuite map[string]bool) {
	defer trackingTime(time.Now())

	for input, match := range testsuite {
		//mat := pat.FindStringIndex(input)
		//matches := (mat != nil)
		matches := pat.MatchString(input)

		if match != matches {
			fmt.Printf("FAIL for %s (should be %t)\n", input, matches)
			return
		}
	}
}

func main() {
	low := 1
	upp := 500

	regex := generate_regex(low, upp, 2)
	fmt.Println(regex)
	fmt.Println("\n\n\n\n")
	testsuite := generate_input_strings(low, upp)

	fmt.Printf("Generated regex of string length %d.\n", len(regex))
	fmt.Printf("Generated %d input strings.\n", len(testsuite))

	pat := regexp.MustCompile(regex)

	fmt.Println("Regex compilation has finished.\n")

	run_test(pat, testsuite)

	os.Exit(0)
}

01 removed benchmarking from 00

echo "meisterluk, overall path\n"
go run 01.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
Go
§
package main

import (
  "bytes"
  "fmt"
  "os"
  "regexp"
  "strconv"
  "strings"
)

// Generate a regex accepting any string but the string of concatenated numbers
// lower_bound to upper_bound (each inclusive).
func generate_regex(lower_bound, upper_bound, mode int) string {
  var not_str bytes.Buffer
  var buf bytes.Buffer

  // generate not_string
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()
  l := len(not_string)

  // mode 3 = lookahead
  if mode == 3 {
    buf.WriteString("^(?!")
    buf.WriteString(not_string)
    buf.WriteString("$).*$")
    return buf.String()
  }

  buf.WriteString("^(")

  // strings with length smaller than not_string
  // mode 1 == |.|..|...|…
  if mode == 1 {
    for i := 0; i < l; i++ {
      buf.WriteString(strings.Repeat(".", i))
      if i != l-1 {
        buf.WriteString("|")
      }
    }
    // mode 2 == .?.?.?.?…
  } else if mode == 2 {
    buf.WriteString(strings.Repeat(".?", l-1))
  }

  buf.WriteString("|")

  // strings with length equal to not_string
  for index := 0; index < l; index++ {
    buf.WriteString(strings.Repeat(".", index))
    buf.WriteString("[^")
    buf.WriteString(string(not_string[index]))
    buf.WriteString("]")
    buf.WriteString(strings.Repeat(".", l-index-1))
    if index != l-1 {
      buf.WriteString("|")
    }
  }

  buf.WriteString("|")

  // strings with length greater than not_string

  buf.WriteString(strings.Repeat(".", l))
  buf.WriteString(".+")

  buf.WriteString(")$")

  return buf.String()
}

// Generate less than `(upper_bound - lower_bound)^2` input strings.
// Returns testsuite dict.
func generate_input_strings(lower_bound, upper_bound int) map[string]bool {
  testsuite := make(map[string]bool)

  // generate not_string
  var not_str bytes.Buffer
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()

  // generate strings
  for begin := lower_bound; begin <= upper_bound; begin++ {
    for end := begin + 1; end <= upper_bound; end++ {
      var input_string = not_string[begin:end]
      matches := (input_string != not_string)
      testsuite[input_string] = matches
    }
  }

  return testsuite
}

// Actually run all pattern searches.
func run_test(pat *regexp.Regexp, testsuite map[string]bool) {
  for input, match := range testsuite {
    //mat := pat.FindStringIndex(input)
    //matches := (mat != nil)
    matches := pat.MatchString(input)

    if match != matches {
      fmt.Printf("FAIL for %s (should be %t)\n", input, matches)
      return
    }
  }
}

func main() {
  low := 1
  upp := 500

  regex := generate_regex(low, upp, 2)
  fmt.Println(regex)
  fmt.Println("\n\n\n\n")
  testsuite := generate_input_strings(low, upp)

  fmt.Printf("Generated regex of string length %d.\n", len(regex))
  fmt.Printf("Generated %d input strings.\n", len(testsuite))

  pat := regexp.MustCompile(regex)

  fmt.Println("Regex compilation has finished.\n")

  run_test(pat, testsuite)

  os.Exit(0)
}

02 meisterluk source code with testing

echo "meisterluk, overall + critical path, testing\n"
go test -bench RegexpNot 02_test.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
Go
§
package main

import (
  "bytes"
  "fmt"
  "os"
  "regexp"
  "strconv"
  "strings"
  "testing"
)

// Generate a regex accepting any string but the string of concatenated numbers
// lower_bound to upper_bound (each inclusive).
func generate_regex(lower_bound, upper_bound, mode int) string {
  var not_str bytes.Buffer
  var buf bytes.Buffer

  // generate not_string
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()
  l := len(not_string)

  // mode 3 = lookahead
  if mode == 3 {
    buf.WriteString("^(?!")
    buf.WriteString(not_string)
    buf.WriteString("$).*$")
    return buf.String()
  }

  buf.WriteString("^(")

  // strings with length smaller than not_string
  // mode 1 == |.|..|...|…
  if mode == 1 {
    for i := 0; i < l; i++ {
      buf.WriteString(strings.Repeat(".", i))
      if i != l-1 {
        buf.WriteString("|")
      }
    }
    // mode 2 == .?.?.?.?…
  } else if mode == 2 {
    buf.WriteString(strings.Repeat(".?", l-1))
  }

  buf.WriteString("|")

  // strings with length equal to not_string
  for index := 0; index < l; index++ {
    buf.WriteString(strings.Repeat(".", index))
    buf.WriteString("[^")
    buf.WriteString(string(not_string[index]))
    buf.WriteString("]")
    buf.WriteString(strings.Repeat(".", l-index-1))
    if index != l-1 {
      buf.WriteString("|")
    }
  }

  buf.WriteString("|")

  // strings with length greater than not_string

  buf.WriteString(strings.Repeat(".", l))
  buf.WriteString(".+")

  buf.WriteString(")$")

  return buf.String()
}

// Generate less than `(upper_bound - lower_bound)^2` input strings.
// Returns testsuite dict.
func generate_input_strings(lower_bound, upper_bound int) map[string]bool {
  testsuite := make(map[string]bool)

  // generate not_string
  var not_str bytes.Buffer
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()

  // generate strings
  for begin := lower_bound; begin <= upper_bound; begin++ {
    for end := begin + 1; end <= upper_bound; end++ {
      var input_string = not_string[begin:end]
      matches := (input_string != not_string)
      testsuite[input_string] = matches
    }
  }

  return testsuite
}

// Actually run all pattern searches.
func run_test(pat *regexp.Regexp, testsuite map[string]bool) {
  for input, match := range testsuite {
    //mat := pat.FindStringIndex(input)
    //matches := (mat != nil)
    matches := pat.MatchString(input)

    if match != matches {
      fmt.Printf("FAIL for %s (should be %t)\n", input, matches)
      return
    }
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  b.StopTimer()
  low := 1
  upp := 500

  regex := generate_regex(low, upp, 2)
  fmt.Println(regex)
  fmt.Println("\n\n\n\n")
  testsuite := generate_input_strings(low, upp)

  fmt.Printf("Generated regex of string length %d.\n", len(regex))
  fmt.Printf("Generated %d input strings.\n", len(testsuite))

  pat := regexp.MustCompile(regex)

  fmt.Println("Regex compilation has finished.\n")

  b.StartTimer()
  for i := 0; i < b.N; i++ {
    run_test(pat, testsuite)
  }

  os.Exit(0)
}

03 runtime/pprof for 01

echo "meisterluk, overall path, runtime/pprof"
echo "Generates a 03.prof file\n"
go build 03.go
./03
go tool pprof --web 03 03.prof
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
Go
§
package main

import (
  "bytes"
  "fmt"
  "os"
  "regexp"
  "runtime/pprof"
  "strconv"
  "strings"
)

// Generate a regex accepting any string but the string of concatenated numbers
// lower_bound to upper_bound (each inclusive).
func generate_regex(lower_bound, upper_bound, mode int) string {
  var not_str bytes.Buffer
  var buf bytes.Buffer

  // generate not_string
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()
  l := len(not_string)

  // mode 3 = lookahead
  if mode == 3 {
    buf.WriteString("^(?!")
    buf.WriteString(not_string)
    buf.WriteString("$).*$")
    return buf.String()
  }

  buf.WriteString("^(")

  // strings with length smaller than not_string
  // mode 1 == |.|..|...|…
  if mode == 1 {
    for i := 0; i < l; i++ {
      buf.WriteString(strings.Repeat(".", i))
      if i != l-1 {
        buf.WriteString("|")
      }
    }
    // mode 2 == .?.?.?.?…
  } else if mode == 2 {
    buf.WriteString(strings.Repeat(".?", l-1))
  }

  buf.WriteString("|")

  // strings with length equal to not_string
  for index := 0; index < l; index++ {
    buf.WriteString(strings.Repeat(".", index))
    buf.WriteString("[^")
    buf.WriteString(string(not_string[index]))
    buf.WriteString("]")
    buf.WriteString(strings.Repeat(".", l-index-1))
    if index != l-1 {
      buf.WriteString("|")
    }
  }

  buf.WriteString("|")

  // strings with length greater than not_string

  buf.WriteString(strings.Repeat(".", l))
  buf.WriteString(".+")

  buf.WriteString(")$")

  return buf.String()
}

// Generate less than `(upper_bound - lower_bound)^2` input strings.
// Returns testsuite dict.
func generate_input_strings(lower_bound, upper_bound int) map[string]bool {
  testsuite := make(map[string]bool)

  // generate not_string
  var not_str bytes.Buffer
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()

  // generate strings
  for begin := lower_bound; begin <= upper_bound; begin++ {
    for end := begin + 1; end <= upper_bound; end++ {
      var input_string = not_string[begin:end]
      matches := (input_string != not_string)
      testsuite[input_string] = matches
    }
  }

  return testsuite
}

// Actually run all pattern searches.
func run_test(pat *regexp.Regexp, testsuite map[string]bool) {
  for input, match := range testsuite {
    //mat := pat.FindStringIndex(input)
    //matches := (mat != nil)
    matches := pat.MatchString(input)

    if match != matches {
      fmt.Printf("FAIL for %s (should be %t)\n", input, matches)
      return
    }
  }
}

func main() {
  f, err := os.Create("01_profiled.prof")
  if err != nil {
    panic(err)
  }
  pprof.StartCPUProfile(f)
  defer pprof.StopCPUProfile()

  low := 1
  upp := 500

  regex := generate_regex(low, upp, 2)
  fmt.Println(regex)
  fmt.Println("\n\n\n\n")
  testsuite := generate_input_strings(low, upp)

  fmt.Printf("Generated regex of string length %d.\n", len(regex))
  fmt.Printf("Generated %d input strings.\n", len(testsuite))

  pat := regexp.MustCompile(regex)

  fmt.Println("Regex compilation has finished.\n")

  run_test(pat, testsuite)

  os.Exit(0)
}

04 refactored main routine

echo "meisterluk, refactored, overall path\n"
go run 04.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
Go
§
package main

import (
  "bytes"
  "fmt"
  "regexp"
  "strconv"
  "strings"
)

// Generate a regex accepting any string but the string of concatenated numbers
// lower_bound to upper_bound (each inclusive).
func generateRegexp(lower_bound, upper_bound int) string {
  var not_str bytes.Buffer
  var buf bytes.Buffer

  // generate not_string
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()
  l := len(not_string)

  buf.WriteString("^(")
  buf.WriteString(strings.Repeat(".?", l-1))
  buf.WriteString("|")

  // strings with length equal to not_string
  for index := 0; index < l; index++ {
    buf.WriteString(strings.Repeat(".", index))
    buf.WriteString("[^")
    buf.WriteString(string(not_string[index]))
    buf.WriteString("]")
    buf.WriteString(strings.Repeat(".", l-index-1))
    if index != l-1 {
      buf.WriteString("|")
    }
  }

  buf.WriteString("|")

  // strings with length greater than not_string

  buf.WriteString(strings.Repeat(".", l))
  buf.WriteString(".+")

  buf.WriteString(")$")

  return buf.String()
}

// Generate less than `(upper_bound - lower_bound)^2` input strings.
// Returns testsuite dict.
func generateTests(lower_bound, upper_bound int) map[string]bool {
  testsuite := make(map[string]bool)

  // generate not_string
  var not_str bytes.Buffer
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()

  // generate strings
  for begin := lower_bound; begin <= upper_bound; begin++ {
    for end := begin + 1; end <= upper_bound; end++ {
      var input_string = not_string[begin:end]
      matches := (input_string != not_string)
      testsuite[input_string] = matches
    }
  }

  return testsuite
}

// Actually run all pattern searches.
func run_test(pat *regexp.Regexp, testsuite map[string]bool) {
  for input, match := range testsuite {
    //mat := pat.FindStringIndex(input)
    //matches := (mat != nil)
    matches := pat.MatchString(input)

    if match != matches {
      fmt.Printf("FAIL for %s (should be %t)\n", input, matches)
      return
    }
  }
}

func main() {
  lower, upper := 1, 500
  regex := generateRegexp(lower, upper)
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    panic(fmt.Sprintf("regexp.Compile:", err))
  }
  run_test(re, tests)
}

05 BenchmarkGenerateRegexp

echo "benchmarking regex generation: meisterluk, testing\n"
go test -bench GenerateRegexp 05_test.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
Go
§
package main

import (
  "bytes"
  "fmt"
  "regexp"
  "strconv"
  "strings"
  "testing"
)

// Generate a regex accepting any string but the string of concatenated numbers
// lower_bound to upper_bound (each inclusive).
func generateRegexp(lower_bound, upper_bound int) string {
  var not_str bytes.Buffer
  var buf bytes.Buffer

  // generate not_string
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()
  l := len(not_string)

  buf.WriteString("^(")
  buf.WriteString(strings.Repeat(".?", l-1))
  buf.WriteString("|")

  // strings with length equal to not_string
  for index := 0; index < l; index++ {
    buf.WriteString(strings.Repeat(".", index))
    buf.WriteString("[^")
    buf.WriteString(string(not_string[index]))
    buf.WriteString("]")
    buf.WriteString(strings.Repeat(".", l-index-1))
    if index != l-1 {
      buf.WriteString("|")
    }
  }

  buf.WriteString("|")

  // strings with length greater than not_string

  buf.WriteString(strings.Repeat(".", l))
  buf.WriteString(".+")

  buf.WriteString(")$")

  return buf.String()
}

// Generate less than `(upper_bound - lower_bound)^2` input strings.
// Returns testsuite dict.
func generateTests(lower_bound, upper_bound int) map[string]bool {
  testsuite := make(map[string]bool)

  // generate not_string
  var not_str bytes.Buffer
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()

  // generate strings
  for begin := lower_bound; begin <= upper_bound; begin++ {
    for end := begin + 1; end <= upper_bound; end++ {
      var input_string = not_string[begin:end]
      matches := (input_string != not_string)
      testsuite[input_string] = matches
    }
  }

  return testsuite
}

// Actually run all pattern searches.
func run_test(pat *regexp.Regexp, testsuite map[string]bool) {
  for input, match := range testsuite {
    //mat := pat.FindStringIndex(input)
    //matches := (mat != nil)
    matches := pat.MatchString(input)

    if match != matches {
      fmt.Printf("FAIL for %s (should be %t)\n", input, matches)
      return
    }
  }
}

func BenchmarkGenerateRegexp(b *testing.B) {
  for i := 0; i < b.N; i++ {
    generateRegexp(1, 500)
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  lower, upper := 1, 500
  regex := generateRegexp(lower, upper)
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  for i := 0; i < b.N; i++ {
    run_test(re, tests)
  }
}

06 print regex for (1,3)

echo "print regex for (1,3): meisterluk\n"
go run 06.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
Go
§
package main

import (
  "bytes"
  "fmt"
  "strconv"
  "strings"
)

// Generate a regex accepting any string but the string of concatenated numbers
// lower_bound to upper_bound (each inclusive).
func generateRegexp(lower_bound, upper_bound int) string {
  var not_str bytes.Buffer
  var buf bytes.Buffer

  // generate not_string
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()
  l := len(not_string)

  buf.WriteString("^(")
  buf.WriteString(strings.Repeat(".?", l-1))
  buf.WriteString("|")

  // strings with length equal to not_string
  for index := 0; index < l; index++ {
    buf.WriteString(strings.Repeat(".", index))
    buf.WriteString("[^")
    buf.WriteString(string(not_string[index]))
    buf.WriteString("]")
    buf.WriteString(strings.Repeat(".", l-index-1))
    if index != l-1 {
      buf.WriteString("|")
    }
  }

  buf.WriteString("|")

  // strings with length greater than not_string

  buf.WriteString(strings.Repeat(".", l))
  buf.WriteString(".+")

  buf.WriteString(")$")

  return buf.String()
}

func main() {
  fmt.Println(generateRegexp(1, 3))
}

07 meisterluk with fmt.Fprint

echo "meisterluk, fmt.Fprint\n"
go test -bench GenerateRegexp 07_test.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
Go
§
package main

import (
  "bytes"
  "fmt"
  "regexp"
  "strings"
  "testing"
)

// Generate a regex accepting any string but the string of concatenated numbers
// lower_bound to upper_bound (each inclusive).
func generateRegexp(lower_bound, upper_bound int) string {
  not_str := &bytes.Buffer{}
  buf := &bytes.Buffer{}

  // generate not_string
  for number := lower_bound; number <= upper_bound; number++ {
    fmt.Fprint(not_str, number)
  }
  not_string := not_str.String()
  l := len(not_string)

  fmt.Fprint(buf, "^(")
  fmt.Fprint(buf, strings.Repeat(".?", l-1))
  fmt.Fprint(buf, "|")

  // strings with length equal to not_string
  for index := 0; index < l; index++ {
    fmt.Fprint(buf, strings.Repeat(".", index))
    fmt.Fprint(buf, "[^")
    fmt.Fprint(buf, string(not_string[index]))
    fmt.Fprint(buf, "]")
    fmt.Fprint(buf, strings.Repeat(".", l-index-1))
    if index != l-1 {
      fmt.Fprint(buf, "|")
    }
  }

  fmt.Fprint(buf, "|")

  // strings with length greater than not_string

  fmt.Fprint(buf, strings.Repeat(".", l))
  fmt.Fprint(buf, ".+")

  fmt.Fprint(buf, ")$")

  return buf.String()
}

// Generate less than `(upper_bound - lower_bound)^2` input strings.
// Returns testsuite dict.
func generateTests(lower_bound, upper_bound int) map[string]bool {
  testsuite := make(map[string]bool)

  // generate not_string
  not_str := &bytes.Buffer{}
  for number := lower_bound; number <= upper_bound; number++ {
    fmt.Fprint(not_str, number)
  }
  not_string := not_str.String()

  // generate strings
  for begin := lower_bound; begin <= upper_bound; begin++ {
    for end := begin + 1; end <= upper_bound; end++ {
      var input_string = not_string[begin:end]
      matches := (input_string != not_string)
      testsuite[input_string] = matches
    }
  }

  return testsuite
}

// Actually run all pattern searches.
func run_test(pat *regexp.Regexp, testsuite map[string]bool) {
  for input, match := range testsuite {
    //mat := pat.FindStringIndex(input)
    //matches := (mat != nil)
    matches := pat.MatchString(input)

    if match != matches {
      fmt.Printf("FAIL for %s (should be %t)\n", input, matches)
      return
    }
  }
}

func BenchmarkGenerateRegexp(b *testing.B) {
  for i := 0; i < b.N; i++ {
    generateRegexp(1, 500)
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  lower, upper := 1, 500
  regex := generateRegexp(lower, upper)
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  for i := 0; i < b.N; i++ {
    run_test(re, tests)
  }
}

08 merge tux21b regex generation

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
Go
§
package main

import (
  "bytes"
  "fmt"
  "regexp"
  "testing"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

// Generate less than `(upper_bound - lower_bound)^2` input strings.
// Returns testsuite dict.
func generateTests(lower_bound, upper_bound int) map[string]bool {
  testsuite := make(map[string]bool)

  // generate not_string
  not_str := &bytes.Buffer{}
  for number := lower_bound; number <= upper_bound; number++ {
    fmt.Fprint(not_str, number)
  }
  not_string := generateString(lower_bound, upper_bound)

  // generate strings
  for begin := lower_bound; begin <= upper_bound; begin++ {
    for end := begin + 1; end <= upper_bound; end++ {
      var input_string = not_string[begin:end]
      testsuite[string(input_string)] = !bytes.Equal(input_string, not_string)
    }
  }

  return testsuite
}

// Actually run all pattern searches.
func run_test(pat *regexp.Regexp, testsuite map[string]bool) {
  for input, match := range testsuite {
    //mat := pat.FindStringIndex(input)
    //matches := (mat != nil)
    matches := pat.MatchString(input)

    if match != matches {
      fmt.Printf("FAIL for %s (should be %t)\n", input, matches)
      return
    }
  }
}

func BenchmarkGenerateRegexp(b *testing.B) {
  for i := 0; i < b.N; i++ {
    generateRegexp(generateString(1, 500))
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  for i := 0; i < b.N; i++ {
    run_test(re, tests)
  }
}

09 Test struct

echo "meisterluk, Test struct\n"
go test -bench RegexpNot 09_test.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
Go
§
package main

import (
  "bytes"
  "fmt"
  "regexp"
  "testing"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern string
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: string(str[i:j]),
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

// Actually run all pattern searches.
func run_test(b *testing.B, re *regexp.Regexp, tests []Test) {
  for _, t := range tests {
    if m := re.MatchString(t.Pattern); m != t.Match {
      b.Errorf("Test %q: expected %v, got %v", t.Pattern, t.Match, m)
    }
  }
}

func BenchmarkGenerateRegexp(b *testing.B) {
  for i := 0; i < b.N; i++ {
    generateRegexp(generateString(1, 500))
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  for i := 0; i < b.N; i++ {
    run_test(b, re, tests)
  }
}

10 strings

echo "meisterluk, strings as regex engine input\n"
go test -bench RegexpNot 10_test.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
Go
§
package main

import (
  "bytes"
  "fmt"
  "regexp"
  "testing"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

// Actually run all pattern searches.
func run_test(b *testing.B, re *regexp.Regexp, tests []Test) {
  for _, t := range tests {
    if m := re.Match(t.Pattern); m != t.Match {
      b.Errorf("Test %q: expected %v, got %v", t.Pattern, t.Match, m)
    }
  }
}

func BenchmarkGenerateRegexp(b *testing.B) {
  for i := 0; i < b.N; i++ {
    generateRegexp(generateString(1, 500))
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  for i := 0; i < b.N; i++ {
    run_test(b, re, tests)
  }
}

11 remove run_test

echo "meisterluk, run_test removed\n"
go test -bench RegexpNot 11_test.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
Go
§
package main

import (
  "bytes"
  "fmt"
  "regexp"
  "testing"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

func BenchmarkGenerateRegexp(b *testing.B) {
  for i := 0; i < b.N; i++ {
    generateRegexp(generateString(1, 500))
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  for i := 0; i < b.N; i++ {
    for _, t := range tests {
      if m := re.Match(t.Pattern); m != t.Match {
        b.Errorf("Test %q: expected %v, got %v", t.Pattern, t.Match, m)
      }
    }
  }
}

92 unmatched groups

echo "tux21b, unmatched groups with memory statistics"
go test -bench GenerateRegexp 92_test.go -benchmem
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
Go
§
package main

// courtesy of tux21b

import (
  "bytes"
  "fmt"
  "regexp"
  "strconv"
  "testing"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    buf.WriteString(strconv.Itoa(i))
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "((%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

var simpleTests = []Test{
  {[]byte("ab"), true},
  {[]byte(""), true},
  {[]byte("4"), true},
  {[]byte("0"), true},
  {[]byte("02"), true},
  {[]byte("11"), true},
  {[]byte("0123"), true},
  {[]byte("01234"), false},
  {[]byte("012a4"), true},
  {[]byte("012345"), true},
}

func TestRegexpNot(t *testing.T) {
  str := generateString(0, 4)
  t.Log("Regexp:", generateRegexp(str))
  re, err := regexp.Compile(generateRegexp(str))
  if err != nil {
    t.Fatal("regexp.Compile:", err)
  }
  for _, test := range simpleTests {
    if m := re.Match(test.Pattern); m != test.Match {
      t.Errorf("Test %q: expected %v, got %v.", test.Pattern, test.Match, m)
    }
  }
}

func BenchmarkGenerateRegexp(b *testing.B) {
  for i := 0; i < b.N; i++ {
    generateRegexp(generateString(1, 500))
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  b.StopTimer()
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  b.StartTimer()
  for i := 0; i < b.N; i++ {
    for _, t := range tests {
      if m := re.Match(t.Pattern); m != t.Match {
        b.Errorf("Test %q: expected %v, got %v", t.Pattern, t.Match, m)
      }
    }
  }
}

93 tux21b with strconv.Itoa

echo "tux21b, strconv.Itoa\n"
go test -bench GenerateRegexp 93_test.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
Go
§
package main

// courtesy of tux21b

import (
  "bytes"
  "fmt"
  "regexp"
  "strconv"
  "testing"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    buf.WriteString(strconv.Itoa(i))
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

var simpleTests = []Test{
  {[]byte("ab"), true},
  {[]byte(""), true},
  {[]byte("4"), true},
  {[]byte("0"), true},
  {[]byte("02"), true},
  {[]byte("11"), true},
  {[]byte("0123"), true},
  {[]byte("01234"), false},
  {[]byte("012a4"), true},
  {[]byte("012345"), true},
}

func TestRegexpNot(t *testing.T) {
  str := generateString(0, 4)
  t.Log("Regexp:", generateRegexp(str))
  re, err := regexp.Compile(generateRegexp(str))
  if err != nil {
    t.Fatal("regexp.Compile:", err)
  }
  for _, test := range simpleTests {
    if m := re.Match(test.Pattern); m != test.Match {
      t.Errorf("Test %q: expected %v, got %v.", test.Pattern, test.Match, m)
    }
  }
}

func BenchmarkGenerateRegexp(b *testing.B) {
  for i := 0; i < b.N; i++ {
    generateRegexp(generateString(1, 500))
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  b.StopTimer()
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  b.StartTimer()
  for i := 0; i < b.N; i++ {
    for _, t := range tests {
      if m := re.Match(t.Pattern); m != t.Match {
        b.Errorf("Test %q: expected %v, got %v", t.Pattern, t.Match, m)
      }
    }
  }
}

94 print regex for (1,3)

echo "print regex for (1,3): tux21b\n"
go run 94.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
Go
§
package main

// courtesy of tux21b

import (
  "bytes"
  "fmt"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

func main() {
  fmt.Println(generateRegexp(generateString(1, 3)))
}

95 BenchmarkGenerateRegexp

echo "benchmarking regex generation: tux21b, testing\n"
go test -bench GenerateRegexp 95_test.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
Go
§
package main

// courtesy of tux21b

import (
  "bytes"
  "fmt"
  "regexp"
  "testing"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

var simpleTests = []Test{
  {[]byte("ab"), true},
  {[]byte(""), true},
  {[]byte("4"), true},
  {[]byte("0"), true},
  {[]byte("02"), true},
  {[]byte("11"), true},
  {[]byte("0123"), true},
  {[]byte("01234"), false},
  {[]byte("012a4"), true},
  {[]byte("012345"), true},
}

func TestRegexpNot(t *testing.T) {
  str := generateString(0, 4)
  t.Log("Regexp:", generateRegexp(str))
  re, err := regexp.Compile(generateRegexp(str))
  if err != nil {
    t.Fatal("regexp.Compile:", err)
  }
  for _, test := range simpleTests {
    if m := re.Match(test.Pattern); m != test.Match {
      t.Errorf("Test %q: expected %v, got %v.", test.Pattern, test.Match, m)
    }
  }
}

func BenchmarkGenerateRegexp(b *testing.B) {
  for i := 0; i < b.N; i++ {
    generateRegexp(generateString(1, 500))
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  b.StopTimer()
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  b.StartTimer()
  for i := 0; i < b.N; i++ {
    for _, t := range tests {
      if m := re.Match(t.Pattern); m != t.Match {
        b.Errorf("Test %q: expected %v, got %v", t.Pattern, t.Match, m)
      }
    }
  }
}

96 pprof for 98

echo "tux21b, overall path, runtime/pprof"
echo "Generates a 96.prof file\n"
go build 96.go
./96
go tool pprof --web 96 96.prof
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
Go
§
package main

// courtesy of tux21b

import (
  "bytes"
  "fmt"
  "os"
  "regexp"
  "runtime/pprof"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

var simpleTests = []Test{
  {[]byte("ab"), true},
  {[]byte(""), true},
  {[]byte("4"), true},
  {[]byte("0"), true},
  {[]byte("02"), true},
  {[]byte("11"), true},
  {[]byte("0123"), true},
  {[]byte("01234"), false},
  {[]byte("012a4"), true},
  {[]byte("012345"), true},
}

func TestRegexpNot() {
  str := generateString(0, 4)
  panic(fmt.Sprintf("Regexp:", generateRegexp(str)))
  re, err := regexp.Compile(generateRegexp(str))
  if err != nil {
    panic(fmt.Sprintf("regexp.Compile:", err))
  }
  for _, test := range simpleTests {
    if m := re.Match(test.Pattern); m != test.Match {
      panic(fmt.Sprintf("Test %q: expected %v, got %v.", test.Pattern, test.Match, m))
    }
  }
}

func main() {
  f, err := os.Create("96.prof")
  if err != nil {
    panic(err)
  }
  pprof.StartCPUProfile(f)
  defer pprof.StopCPUProfile()

  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    panic(fmt.Sprintf("regexp.Compile:", err))
  }
  for _, t := range tests {
    if m := re.Match(t.Pattern); m != t.Match {
      panic(fmt.Sprintf("Test %q: expected %v, got %v", t.Pattern, t.Match, m))
    }
  }
}

97 tux21b source code with time

echo "tux21b, overall + critical path, time\n"
go run 97.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
Go
§
package main

// courtesy of tux21b

import (
  "bytes"
  "fmt"
  "regexp"
  "time"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

var simpleTests = []Test{
  {[]byte("ab"), true},
  {[]byte(""), true},
  {[]byte("4"), true},
  {[]byte("0"), true},
  {[]byte("02"), true},
  {[]byte("11"), true},
  {[]byte("0123"), true},
  {[]byte("01234"), false},
  {[]byte("012a4"), true},
  {[]byte("012345"), true},
}

func TestRegexpNot() {
  str := generateString(0, 4)
  panic(fmt.Sprintf("Regexp:", generateRegexp(str)))
  re, err := regexp.Compile(generateRegexp(str))
  if err != nil {
    panic(fmt.Sprintf("regexp.Compile:", err))
  }
  for _, test := range simpleTests {
    if m := re.Match(test.Pattern); m != test.Match {
      panic(fmt.Sprintf("Test %q: expected %v, got %v.", test.Pattern, test.Match, m))
    }
  }
}

// A helper for measuring time
func trackingTime(begin time.Time) {
  elapsed := time.Since(begin)
  fmt.Printf("It takes %s to test all input strings.", elapsed)
}

// Actually run all pattern searches.
func run_test(re *regexp.Regexp, tests []Test) {
  defer trackingTime(time.Now())

  for _, t := range tests {
    if m := re.Match(t.Pattern); m != t.Match {
      panic(fmt.Sprintf("Test %q: expected %v, got %v", t.Pattern, t.Match, m))
    }
  }
}

func main() {
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    panic(fmt.Sprintf("regexp.Compile:", err))
  }
  run_test(re, tests)
}

98 removed benchmarking from 99

echo "meisterluk, overall path\n"
go run 98.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
Go
§
package main

// courtesy of tux21b

import (
  "bytes"
  "fmt"
  "regexp"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

var simpleTests = []Test{
  {[]byte("ab"), true},
  {[]byte(""), true},
  {[]byte("4"), true},
  {[]byte("0"), true},
  {[]byte("02"), true},
  {[]byte("11"), true},
  {[]byte("0123"), true},
  {[]byte("01234"), false},
  {[]byte("012a4"), true},
  {[]byte("012345"), true},
}

func TestRegexpNot() {
  str := generateString(0, 4)
  panic(fmt.Sprintf("Regexp:", generateRegexp(str)))
  re, err := regexp.Compile(generateRegexp(str))
  if err != nil {
    panic(fmt.Sprintf("regexp.Compile:", err))
  }
  for _, test := range simpleTests {
    if m := re.Match(test.Pattern); m != test.Match {
      panic(fmt.Sprintf("Test %q: expected %v, got %v.", test.Pattern, test.Match, m))
    }
  }
}

func main() {
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    panic(fmt.Sprintf("regexp.Compile:", err))
  }
  for _, t := range tests {
    if m := re.Match(t.Pattern); m != t.Match {
      panic(fmt.Sprintf("Test %q: expected %v, got %v", t.Pattern, t.Match, m))
    }
  }
}

99 tux21b's version

echo "tux21b, overall + critical path, testing\n"
go test -bench 'RegexpNot' 99_test.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
Go
§
package main

// courtesy of tux21b

import (
  "bytes"
  "fmt"
  "regexp"
  "testing"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

var simpleTests = []Test{
  {[]byte("ab"), true},
  {[]byte(""), true},
  {[]byte("4"), true},
  {[]byte("0"), true},
  {[]byte("02"), true},
  {[]byte("11"), true},
  {[]byte("0123"), true},
  {[]byte("01234"), false},
  {[]byte("012a4"), true},
  {[]byte("012345"), true},
}

func TestRegexpNot(t *testing.T) {
  str := generateString(0, 4)
  t.Log("Regexp:", generateRegexp(str))
  re, err := regexp.Compile(generateRegexp(str))
  if err != nil {
    t.Fatal("regexp.Compile:", err)
  }
  for _, test := range simpleTests {
    if m := re.Match(test.Pattern); m != test.Match {
      t.Errorf("Test %q: expected %v, got %v.", test.Pattern, test.Match, m)
    }
  }
}

func BenchmarkRegexpNot(b *testing.B) {
  b.StopTimer()
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re, err := regexp.Compile(regex)
  if err != nil {
    b.Fatal("regexp.Compile:", err)
  }
  b.StartTimer()
  for i := 0; i < b.N; i++ {
    for _, t := range tests {
      if m := re.Match(t.Pattern); m != t.Match {
        b.Errorf("Test %q: expected %v, got %v", t.Pattern, t.Match, m)
      }
    }
  }
}

tux21b regex with sre2

go run notstring.sre2_tux21b.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
Go
§
package main

import (
  "bytes"
  "code.google.com/p/sre2/sre2"
  "fmt"
  "regexp"
  "unicode/utf8"
)

func generateString(lower, upper int) []byte {
  buf := &bytes.Buffer{}
  for i := lower; i <= upper; i++ {
    fmt.Fprint(buf, i)
  }
  return buf.Bytes()
}

func generateRegexp(str []byte) string {
  buf := &bytes.Buffer{}
  fmt.Fprint(buf, "^")
  build(buf, str)
  return buf.String()
}

func build(buf *bytes.Buffer, str []byte) {
  if len(str) == 0 {
    fmt.Fprint(buf, ".+")
    return
  }
  r, n := utf8.DecodeRune(str)
  fmt.Fprintf(buf, "(?:(?:%c", r)
  build(buf, str[n:])
  fmt.Fprintf(buf, ")|[^%c]|$)", r)
}

type Test struct {
  Pattern []byte
  Match   bool
}

func generateTests(lower, upper int) (tests []Test) {
  str := generateString(lower, upper)
  for i := lower; i <= upper; i++ {
    for j := i + 1; j <= upper; j++ {
      tests = append(tests, Test{
        Pattern: str[i:j],
        Match:   !bytes.Equal(str[i:j], str),
      })
    }
  }
  return
}

var simpleTests = []Test{
  {[]byte("ab"), true},
  {[]byte(""), true},
  {[]byte("4"), true},
  {[]byte("0"), true},
  {[]byte("02"), true},
  {[]byte("11"), true},
  {[]byte("0123"), true},
  {[]byte("01234"), false},
  {[]byte("012a4"), true},
  {[]byte("012345"), true},
}

func TestRegexpNot() {
  str := generateString(0, 4)
  panic(fmt.Sprintf("Regexp:", generateRegexp(str)))
  re, err := regexp.Compile(generateRegexp(str))
  if err != nil {
    panic(fmt.Sprintf("regexp.Compile:", err))
  }
  for _, test := range simpleTests {
    if m := re.Match(test.Pattern); m != test.Match {
      panic(fmt.Sprintf("Test %q: expected %v, got %v.", test.Pattern, test.Match, m))
    }
  }
}

func main() {
  lower, upper := 1, 500
  regex := generateRegexp(generateString(lower, upper))
  tests := generateTests(lower, upper)
  re := sre2.MustParse(regex)
  for _, t := range tests {
    if m := re.Match(string(t.Pattern)); m != t.Match {
      panic(fmt.Sprintf("Test %q: expected %v, got %v", t.Pattern, t.Match, m))
    }
  }
}

meisterluk regex with sre2

go run notstring.sre2_meisterluk.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
Go
§
package main

import (
  "bytes"
  "code.google.com/p/sre2/sre2"
  "fmt"
  "os"
  "strconv"
  "strings"
  "time"
)

// Generate a regex accepting any string but the string of concatenated numbers
// lower_bound to upper_bound (each inclusive).
func generate_regex(lower_bound, upper_bound, mode int) string {
  var not_str bytes.Buffer
  var buf bytes.Buffer

  // generate not_string
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()
  l := len(not_string)

  // mode 3 = lookahead
  if mode == 3 {
    buf.WriteString("^(?!")
    buf.WriteString(not_string)
    buf.WriteString("$).*$")
    return buf.String()
  }

  buf.WriteString("^(")

  // strings with length smaller than not_string
  // mode 1 == |.|..|...|…
  if mode == 1 {
    for i := 0; i < l; i++ {
      buf.WriteString(strings.Repeat(".", i))
      if i != l-1 {
        buf.WriteString("|")
      }
    }
    // mode 2 == .?.?.?.?…
  } else if mode == 2 {
    buf.WriteString(strings.Repeat(".?", l-1))
  }

  buf.WriteString("|")

  // strings with length equal to not_string
  for index := 0; index < l; index++ {
    buf.WriteString(strings.Repeat(".", index))
    buf.WriteString("[^")
    buf.WriteString(string(not_string[index]))
    buf.WriteString("]")
    buf.WriteString(strings.Repeat(".", l-index-1))
    if index != l-1 {
      buf.WriteString("|")
    }
  }

  buf.WriteString("|")

  // strings with length greater than not_string

  buf.WriteString(strings.Repeat(".", l))
  buf.WriteString(".+")

  buf.WriteString(")$")

  return buf.String()
}

// Generate less than `(upper_bound - lower_bound)^2` input strings.
// Returns testsuite dict.
func generate_input_strings(lower_bound, upper_bound int) map[string]bool {
  testsuite := make(map[string]bool)

  // generate not_string
  var not_str bytes.Buffer
  for number := lower_bound; number <= upper_bound; number++ {
    not_str.WriteString(strconv.Itoa(number))
  }
  not_string := not_str.String()

  // generate strings
  for begin := lower_bound; begin <= upper_bound; begin++ {
    for end := begin + 1; end <= upper_bound; end++ {
      var input_string = not_string[begin:end]
      matches := (input_string != not_string)
      testsuite[input_string] = matches
    }
  }

  return testsuite
}

// A helper for measuring time
func trackingTime(begin time.Time) {
  elapsed := time.Since(begin)
  fmt.Printf("It takes %s to test all input strings.", elapsed)
}

// Actually run all pattern searches.
func run_test(pat sre2.Re, testsuite map[string]bool) {
  defer trackingTime(time.Now())

  for input, match := range testsuite {
    //mat := pat.FindStringIndex(input)
    //matches := (mat != nil)
    matches := pat.Match(input)

    if match != matches {
      fmt.Printf("FAIL for %s (should be %t)\n", input, matches)
      return
    }
  }
}

func main() {
  low := 1
  upp := 500

  regex := generate_regex(low, upp, 2)
  //fmt.Println(regex)
  //fmt.Println("\n\n\n\n")
  testsuite := generate_input_strings(low, upp)

  fmt.Printf("Generated regex of string length %d.\n", len(regex))
  fmt.Printf("Generated %d input strings.\n", len(testsuite))

  pat := sre2.MustParse(regex)

  fmt.Println("Regex compilation has finished.\n")

  run_test(pat, testsuite)

  os.Exit(0)
}

tux21b's version in python3

python3 notstring.performance.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
Python
§
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
    Find a regex accepting any arbitrary string but the string with numbers 0 to 500.

    (C) 2015, Public Domain, meisterluk
"""

import re
import sys
import timeit

sys.setrecursionlimit(5583)

def generate_string(lower_bound, upper_bound):
    """Generate the string concatenating all values from lower_bound
    to upper_bound.
    """
    return ''.join(str(v) for v in range(lower_bound, upper_bound + 1))


def generate_regex(string):
    """Generate a regex matching all strings except the given one.
    This implements tux21b's regex, but uses slow unicode string operations.
    """
    #def build(s):
    #    if not s:
    #        return ".+"
    #    return "(?:(?:" + s[0] + build(s[1:]) + ")|[^" + s[0] + "]|$)"
    #return "^" + build(string)

    s = "^%"

    for c in string:
        t = "(?:(?:" + c + "%)|[^" + c + "]|$)"
        s = s.replace("%", t)

    return "^" + s.replace("%", ".+")


def generate_input_strings(lower_bound, upper_bound):
    """Generate less than `(upper_bound - lower_bound)^2` input strings.
    Returns testsuite dict.
    """
    testsuite = {}
    not_string = generate_string(lower_bound, upper_bound)

    for begin in range(lower_bound, upper_bound + 1):
        for end in range(begin + 1, upper_bound + 1):
            input_string = not_string[begin:end]
            matches = (input_string != not_string)
            testsuite[input_string] = matches

    return testsuite


def run_test(pat, testsuite):
    """Actually run all pattern searches. Does not use global variables."""
    for string, match in testsuite.items():
        assert bool(pat.search(string)) == match


PATTERN = None
TESTSUITE = None
def run_timeit_test():
    """Actually run all pattern searches. Uses global variables."""
    for string, match in TESTSUITE.items():
        assert bool(PATTERN.search(string)) == match


def main(low, upp):
    """Main routine"""
    global PATTERN
    global TESTSUITE

    # do not time measure that.
    # Initialization is not considered computation time.
    regex = generate_regex(generate_string(low, upp))
    testsuite = generate_input_strings(low, upp)
    print("Generated regex of string length {}.".format(len(regex)))
    print("Generated {} input strings.".format(len(testsuite)))
    pat = re.compile(regex, flags=re.S | re.U)
    print("Regex compilation has finished.")

    # Measure here
    #run_test(pat, testsuite)

    PATTERN = pat
    TESTSUITE = testsuite

    duration = timeit.timeit("run_timeit_test()",
        setup="from __main__ import run_timeit_test",
        number=100)
    print('It takes {} seconds to test all input strings.'.format(duration))

    return 0


if __name__ == '__main__':
    low = 1
    upp = int(sys.argv[1], 10) if len(sys.argv) > 1 else 500

    sys.exit(main(low, upp))

tux21b's version in Java

javac NotStringPerformance.java
java -Xss1361k NotStringPerformance
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
Java
§
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class NotStringPerformance {

  public static final boolean lookahead = true;
  public static final int mode = 2;

  /**
   * Generate the string which must not be matched.
   * 
   * @param lower_bound
   *          the lower value x in x‖x+1‖…‖y
   * @param lower_bound
   *          the greater value y in x‖x+1‖…‖y
   * @return the string concatenation x‖x+1‖…‖y
   */
  public static String generateString(int lower_bound, int upper_bound) {
    String not_string = "";
    for (int number = lower_bound; number <= upper_bound; number++) {
      not_string += Integer.toString(number);
    }
    return not_string;
  }

  /**
   * Generate a regex accepting any string but the given string.
   * 
   * Be aware that here tux21b's version is implemented,
   * but unlike the original version it uses slow string
   * operations with Unicode support.
   * 
   * @param str
   *          the string not to match
   * @return the regular expression
   */
  public static String generateRegex(String str) {
    return "^" + build(str);
  }
  
  public static String build(String str) {
    if (str.length() == 0)
      return ".+";

    char c = str.charAt(0);
    return "(?:(?:" + c + build(str.substring(1)) + ")|[^" + c + "]|$)";
  }
  
  /**
   * Generate less than `(upper_bound - lower_bound)^2` input strings.
   * Returns testsuite map.
   * 
   * @param lower_bound
   * @param upper_bound
   * @return
   */
  public static Map<String, Boolean> generateInputStrings(int lower_bound, int upper_bound) {
    Map<String, Boolean> testsuite = new HashMap<String, Boolean>();
    String not_string = generateString(lower_bound, upper_bound);

    for (int begin = lower_bound; begin <= upper_bound; begin++) {
      for (int end = begin + 1; end <= upper_bound; end++) {
        String input = not_string.substring(begin, end);
        boolean matches = (!input.equals(not_string));
        testsuite.put(input, new Boolean(matches));
      }
    }

    return testsuite;
  }

  public static void runTest(Pattern pat, Map<String, Boolean> testsuite) {
    for (Entry<String, Boolean> e : testsuite.entrySet()) {
      String input = e.getKey();
      Matcher m = pat.matcher(input);

      if (!e.getValue().equals(m.find())) {
        String val = "true";
        if (!e.getValue()) {
          val = "false";
        }
        System.err.println("FAIL for '" + input + "' (should be " + val + ")");
      }
    }
  }

  public static void main(String[] argv) {
    int low = 1;
    int upp = 500;
    
    // do not time measure that.
    // Initialization is not considered computation time.
    String regex = generateRegex(generateString(low, upp));
    Map<String, Boolean> testsuite = generateInputStrings(low, upp);

    System.out.print("Generated regex of string length ");
    System.out.print(regex.length());
    System.out.println(".");

    System.out.print("Generated ");
    System.out.print(testsuite.size());
    System.out.println(" input strings.");

    Pattern pat = Pattern.compile(regex, Pattern.DOTALL);

    System.out.println("Regex compilation has finished.");

    // Measure here
    long start = System.currentTimeMillis();
    runTest(pat, testsuite);
    long end = System.currentTimeMillis();
    long diff = (end - start);

    System.out.print("It takes ");
    System.out.print(diff / 1000.0);
    System.out.println(" seconds to test all input strings.");
  }
}

tux21b's version in perl

perl -wT notstring.performance.pl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
Perl
§
#!/usr/bin/perl -wT

use strict;
use warnings;

# Replace a substring *search* with *replacement* inside *base*
sub replace {
    my($base, $search, $replacement) = @_;
    my $pos = index($base, $search);

    if ($pos > -1) {
        substr($base, $pos, length($search), $replacement);
    }

    return $base;
}

# Generate a string concatenating numbers of first argument to second argument
sub generate_string {
    my($lower_bound, $upper_bound) = @_;
    my $not_string = "";

    for my $number ($lower_bound..$upper_bound) {
        $not_string .= "" . $number;
    }

    return $not_string;
}

# Generate a regex matching all strings except the given one
# This implements tux21b's regex
sub generate_regex {
    my $regex = "^%";

    foreach my $c (split //, $_[0]) {
        my $t = "(?:(?:" . $c . "%)|[^" . $c . "]|\$)";
        $regex = replace($regex, "%", $t);
    }

    return replace($regex, "%", ".+");
}

# Generate less than `(upper_bound - lower_bound)^2` input strings.
# Returns testsuite dict.
sub generate_input_strings {
    my($lower_bound, $upper_bound) = @_;
    my %testsuite;

    # generate not_string
    my $not_string = "";
    for my $number ($lower_bound..$upper_bound) {
        $not_string .= "" . $number;
    }

    # generate strings
    for my $begin ($lower_bound..$upper_bound) {
        for my $end (($begin + 1)..$upper_bound) {
            my $input_string = substr($not_string, $begin, $end - $begin);
            my $matches = ($input_string ne $not_string);
            $testsuite{$input_string} = $matches;
        }
    }

    return %testsuite;
}


sub run_test {
    my($regex, %testsuite) = @_;

    for my $input (keys %testsuite) {
        my $matches = !!($input =~ $regex);
        if ($matches ne !!$testsuite{$input}) {
            printf "FAIL for %s (should be %s)\n", $input, $matches && "true" || "false";
        }
    }
}

sub main {
    my $low = 1;
    my $upp = 500;

    my $regex = generate_regex(generate_string($low, $upp));
    my %testsuite = generate_input_strings($low, $upp);

    printf "Generated regex of string length %d.\n", length $regex;
    printf "Generated %d input strings.\n", scalar keys %testsuite;

    print "Regex compilation not available.\n";

    my $begin = time;
    run_test($regex, %testsuite);
    my $end = time;

    print "It takes " . ($end - $begin) . " seconds to test all input strings.\n";
}

main();

re2dfa

go run re2dfa_use.go
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
Go
§
package main

import (
  "fmt"
  "io/ioutil"
  "log"

  "github.com/opennota/re2dfa/codegen"
  "github.com/opennota/re2dfa/dfa"
  "github.com/opennota/re2dfa/nfa"
)

func main() {
  expr, err := ioutil.ReadFile("my_complex_regex")
  if err != nil {
    log.Fatal("regex invalid")
  }

  typed := "string"
  pkg := "main"
  fun := "regexpNot"

  nfanode, err := nfa.New(string(expr))
  if err != nil {
    log.Fatal(err)
  }

  node := dfa.NewFromNFA(nfanode)
  source := codegen.GoGenerate(node, pkg, fun, typed)
  fmt.Println(source)
}