gen.go 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. package main
  6. // This program generates table.go and table_test.go based on the authoritative
  7. // public suffix list at https://publicsuffix.org/list/effective_tld_names.dat
  8. //
  9. // The version is derived from
  10. // https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat
  11. // and a human-readable form is at
  12. // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
  13. //
  14. // To fetch a particular git revision, such as 5c70ccd250, pass
  15. // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
  16. // and -version "an explicit version string".
  17. import (
  18. "bufio"
  19. "bytes"
  20. "flag"
  21. "fmt"
  22. "go/format"
  23. "io"
  24. "io/ioutil"
  25. "net/http"
  26. "os"
  27. "regexp"
  28. "sort"
  29. "strings"
  30. "golang.org/x/net/idna"
  31. )
  32. const (
  33. // These sum of these four values must be no greater than 32.
  34. nodesBitsChildren = 9
  35. nodesBitsICANN = 1
  36. nodesBitsTextOffset = 15
  37. nodesBitsTextLength = 6
  38. // These sum of these four values must be no greater than 32.
  39. childrenBitsWildcard = 1
  40. childrenBitsNodeType = 2
  41. childrenBitsHi = 14
  42. childrenBitsLo = 14
  43. )
  44. var (
  45. maxChildren int
  46. maxTextOffset int
  47. maxTextLength int
  48. maxHi uint32
  49. maxLo uint32
  50. )
  51. func max(a, b int) int {
  52. if a < b {
  53. return b
  54. }
  55. return a
  56. }
  57. func u32max(a, b uint32) uint32 {
  58. if a < b {
  59. return b
  60. }
  61. return a
  62. }
  63. const (
  64. nodeTypeNormal = 0
  65. nodeTypeException = 1
  66. nodeTypeParentOnly = 2
  67. numNodeType = 3
  68. )
  69. func nodeTypeStr(n int) string {
  70. switch n {
  71. case nodeTypeNormal:
  72. return "+"
  73. case nodeTypeException:
  74. return "!"
  75. case nodeTypeParentOnly:
  76. return "o"
  77. }
  78. panic("unreachable")
  79. }
  80. const (
  81. defaultURL = "https://publicsuffix.org/list/effective_tld_names.dat"
  82. gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat"
  83. )
  84. var (
  85. labelEncoding = map[string]uint32{}
  86. labelsList = []string{}
  87. labelsMap = map[string]bool{}
  88. rules = []string{}
  89. // validSuffixRE is used to check that the entries in the public suffix
  90. // list are in canonical form (after Punycode encoding). Specifically,
  91. // capital letters are not allowed.
  92. validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
  93. shaRE = regexp.MustCompile(`"sha":"([^"]+)"`)
  94. dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
  95. comments = flag.Bool("comments", false, "generate table.go comments, for debugging")
  96. subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
  97. url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
  98. v = flag.Bool("v", false, "verbose output (to stderr)")
  99. version = flag.String("version", "", "the effective_tld_names.dat version")
  100. )
  101. func main() {
  102. if err := main1(); err != nil {
  103. fmt.Fprintln(os.Stderr, err)
  104. os.Exit(1)
  105. }
  106. }
  107. func main1() error {
  108. flag.Parse()
  109. if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
  110. return fmt.Errorf("not enough bits to encode the nodes table")
  111. }
  112. if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
  113. return fmt.Errorf("not enough bits to encode the children table")
  114. }
  115. if *version == "" {
  116. if *url != defaultURL {
  117. return fmt.Errorf("-version was not specified, and the -url is not the default one")
  118. }
  119. sha, date, err := gitCommit()
  120. if err != nil {
  121. return err
  122. }
  123. *version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date)
  124. }
  125. var r io.Reader = os.Stdin
  126. if *url != "" {
  127. res, err := http.Get(*url)
  128. if err != nil {
  129. return err
  130. }
  131. if res.StatusCode != http.StatusOK {
  132. return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
  133. }
  134. r = res.Body
  135. defer res.Body.Close()
  136. }
  137. var root node
  138. icann := false
  139. br := bufio.NewReader(r)
  140. for {
  141. s, err := br.ReadString('\n')
  142. if err != nil {
  143. if err == io.EOF {
  144. break
  145. }
  146. return err
  147. }
  148. s = strings.TrimSpace(s)
  149. if strings.Contains(s, "BEGIN ICANN DOMAINS") {
  150. icann = true
  151. continue
  152. }
  153. if strings.Contains(s, "END ICANN DOMAINS") {
  154. icann = false
  155. continue
  156. }
  157. if s == "" || strings.HasPrefix(s, "//") {
  158. continue
  159. }
  160. s, err = idna.ToASCII(s)
  161. if err != nil {
  162. return err
  163. }
  164. if !validSuffixRE.MatchString(s) {
  165. return fmt.Errorf("bad publicsuffix.org list data: %q", s)
  166. }
  167. if *subset {
  168. switch {
  169. case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
  170. case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
  171. case s == "ao" || strings.HasSuffix(s, ".ao"):
  172. case s == "ar" || strings.HasSuffix(s, ".ar"):
  173. case s == "arpa" || strings.HasSuffix(s, ".arpa"):
  174. case s == "cy" || strings.HasSuffix(s, ".cy"):
  175. case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
  176. case s == "jp":
  177. case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
  178. case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
  179. case s == "om" || strings.HasSuffix(s, ".om"):
  180. case s == "uk" || strings.HasSuffix(s, ".uk"):
  181. case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
  182. case s == "tw" || strings.HasSuffix(s, ".tw"):
  183. case s == "zw" || strings.HasSuffix(s, ".zw"):
  184. case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
  185. // xn--p1ai is Russian-Cyrillic "рф".
  186. default:
  187. continue
  188. }
  189. }
  190. rules = append(rules, s)
  191. nt, wildcard := nodeTypeNormal, false
  192. switch {
  193. case strings.HasPrefix(s, "*."):
  194. s, nt = s[2:], nodeTypeParentOnly
  195. wildcard = true
  196. case strings.HasPrefix(s, "!"):
  197. s, nt = s[1:], nodeTypeException
  198. }
  199. labels := strings.Split(s, ".")
  200. for n, i := &root, len(labels)-1; i >= 0; i-- {
  201. label := labels[i]
  202. n = n.child(label)
  203. if i == 0 {
  204. if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
  205. n.nodeType = nt
  206. }
  207. n.icann = n.icann && icann
  208. n.wildcard = n.wildcard || wildcard
  209. }
  210. labelsMap[label] = true
  211. }
  212. }
  213. labelsList = make([]string, 0, len(labelsMap))
  214. for label := range labelsMap {
  215. labelsList = append(labelsList, label)
  216. }
  217. sort.Strings(labelsList)
  218. if err := generate(printReal, &root, "table.go"); err != nil {
  219. return err
  220. }
  221. if err := generate(printTest, &root, "table_test.go"); err != nil {
  222. return err
  223. }
  224. return nil
  225. }
  226. func generate(p func(io.Writer, *node) error, root *node, filename string) error {
  227. buf := new(bytes.Buffer)
  228. if err := p(buf, root); err != nil {
  229. return err
  230. }
  231. b, err := format.Source(buf.Bytes())
  232. if err != nil {
  233. return err
  234. }
  235. return ioutil.WriteFile(filename, b, 0644)
  236. }
  237. func gitCommit() (sha, date string, retErr error) {
  238. res, err := http.Get(gitCommitURL)
  239. if err != nil {
  240. return "", "", err
  241. }
  242. if res.StatusCode != http.StatusOK {
  243. return "", "", fmt.Errorf("bad GET status for %s: %d", gitCommitURL, res.Status)
  244. }
  245. defer res.Body.Close()
  246. b, err := ioutil.ReadAll(res.Body)
  247. if err != nil {
  248. return "", "", err
  249. }
  250. if m := shaRE.FindSubmatch(b); m != nil {
  251. sha = string(m[1])
  252. }
  253. if m := dateRE.FindSubmatch(b); m != nil {
  254. date = string(m[1])
  255. }
  256. if sha == "" || date == "" {
  257. retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL)
  258. }
  259. return sha, date, retErr
  260. }
  261. func printTest(w io.Writer, n *node) error {
  262. fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
  263. fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
  264. for _, rule := range rules {
  265. fmt.Fprintf(w, "%q,\n", rule)
  266. }
  267. fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
  268. if err := n.walk(w, printNodeLabel); err != nil {
  269. return err
  270. }
  271. fmt.Fprintf(w, "}\n")
  272. return nil
  273. }
  274. func printReal(w io.Writer, n *node) error {
  275. const header = `// generated by go run gen.go; DO NOT EDIT
  276. package publicsuffix
  277. const version = %q
  278. const (
  279. nodesBitsChildren = %d
  280. nodesBitsICANN = %d
  281. nodesBitsTextOffset = %d
  282. nodesBitsTextLength = %d
  283. childrenBitsWildcard = %d
  284. childrenBitsNodeType = %d
  285. childrenBitsHi = %d
  286. childrenBitsLo = %d
  287. )
  288. const (
  289. nodeTypeNormal = %d
  290. nodeTypeException = %d
  291. nodeTypeParentOnly = %d
  292. )
  293. // numTLD is the number of top level domains.
  294. const numTLD = %d
  295. `
  296. fmt.Fprintf(w, header, *version,
  297. nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
  298. childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
  299. nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
  300. text := combineText(labelsList)
  301. if text == "" {
  302. return fmt.Errorf("internal error: makeText returned no text")
  303. }
  304. for _, label := range labelsList {
  305. offset, length := strings.Index(text, label), len(label)
  306. if offset < 0 {
  307. return fmt.Errorf("internal error: could not find %q in text %q", label, text)
  308. }
  309. maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
  310. if offset >= 1<<nodesBitsTextOffset {
  311. return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
  312. }
  313. if length >= 1<<nodesBitsTextLength {
  314. return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
  315. }
  316. labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
  317. }
  318. fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
  319. for len(text) > 0 {
  320. n, plus := len(text), ""
  321. if n > 64 {
  322. n, plus = 64, " +"
  323. }
  324. fmt.Fprintf(w, "%q%s\n", text[:n], plus)
  325. text = text[n:]
  326. }
  327. if err := n.walk(w, assignIndexes); err != nil {
  328. return err
  329. }
  330. fmt.Fprintf(w, `
  331. // nodes is the list of nodes. Each node is represented as a uint32, which
  332. // encodes the node's children, wildcard bit and node type (as an index into
  333. // the children array), ICANN bit and text.
  334. //
  335. // If the table was generated with the -comments flag, there is a //-comment
  336. // after each node's data. In it is the nodes-array indexes of the children,
  337. // formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
  338. // nodeType is printed as + for normal, ! for exception, and o for parent-only
  339. // nodes that have children but don't match a domain label in their own right.
  340. // An I denotes an ICANN domain.
  341. //
  342. // The layout within the uint32, from MSB to LSB, is:
  343. // [%2d bits] unused
  344. // [%2d bits] children index
  345. // [%2d bits] ICANN bit
  346. // [%2d bits] text index
  347. // [%2d bits] text length
  348. var nodes = [...]uint32{
  349. `,
  350. 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
  351. nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
  352. if err := n.walk(w, printNode); err != nil {
  353. return err
  354. }
  355. fmt.Fprintf(w, `}
  356. // children is the list of nodes' children, the parent's wildcard bit and the
  357. // parent's node type. If a node has no children then their children index
  358. // will be in the range [0, 6), depending on the wildcard bit and node type.
  359. //
  360. // The layout within the uint32, from MSB to LSB, is:
  361. // [%2d bits] unused
  362. // [%2d bits] wildcard bit
  363. // [%2d bits] node type
  364. // [%2d bits] high nodes index (exclusive) of children
  365. // [%2d bits] low nodes index (inclusive) of children
  366. var children=[...]uint32{
  367. `,
  368. 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
  369. childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
  370. for i, c := range childrenEncoding {
  371. s := "---------------"
  372. lo := c & (1<<childrenBitsLo - 1)
  373. hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
  374. if lo != hi {
  375. s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
  376. }
  377. nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
  378. wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
  379. if *comments {
  380. fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
  381. c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
  382. } else {
  383. fmt.Fprintf(w, "0x%x,\n", c)
  384. }
  385. }
  386. fmt.Fprintf(w, "}\n\n")
  387. fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
  388. fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
  389. fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
  390. fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
  391. fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
  392. return nil
  393. }
  394. type node struct {
  395. label string
  396. nodeType int
  397. icann bool
  398. wildcard bool
  399. // nodesIndex and childrenIndex are the index of this node in the nodes
  400. // and the index of its children offset/length in the children arrays.
  401. nodesIndex, childrenIndex int
  402. // firstChild is the index of this node's first child, or zero if this
  403. // node has no children.
  404. firstChild int
  405. // children are the node's children, in strictly increasing node label order.
  406. children []*node
  407. }
  408. func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
  409. if err := f(w, n); err != nil {
  410. return err
  411. }
  412. for _, c := range n.children {
  413. if err := c.walk(w, f); err != nil {
  414. return err
  415. }
  416. }
  417. return nil
  418. }
  419. // child returns the child of n with the given label. The child is created if
  420. // it did not exist beforehand.
  421. func (n *node) child(label string) *node {
  422. for _, c := range n.children {
  423. if c.label == label {
  424. return c
  425. }
  426. }
  427. c := &node{
  428. label: label,
  429. nodeType: nodeTypeParentOnly,
  430. icann: true,
  431. }
  432. n.children = append(n.children, c)
  433. sort.Sort(byLabel(n.children))
  434. return c
  435. }
  436. type byLabel []*node
  437. func (b byLabel) Len() int { return len(b) }
  438. func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
  439. func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
  440. var nextNodesIndex int
  441. // childrenEncoding are the encoded entries in the generated children array.
  442. // All these pre-defined entries have no children.
  443. var childrenEncoding = []uint32{
  444. 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
  445. 1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
  446. 2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
  447. 4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
  448. 5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
  449. 6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
  450. }
  451. var firstCallToAssignIndexes = true
  452. func assignIndexes(w io.Writer, n *node) error {
  453. if len(n.children) != 0 {
  454. // Assign nodesIndex.
  455. n.firstChild = nextNodesIndex
  456. for _, c := range n.children {
  457. c.nodesIndex = nextNodesIndex
  458. nextNodesIndex++
  459. }
  460. // The root node's children is implicit.
  461. if firstCallToAssignIndexes {
  462. firstCallToAssignIndexes = false
  463. return nil
  464. }
  465. // Assign childrenIndex.
  466. maxChildren = max(maxChildren, len(childrenEncoding))
  467. if len(childrenEncoding) >= 1<<nodesBitsChildren {
  468. return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
  469. }
  470. n.childrenIndex = len(childrenEncoding)
  471. lo := uint32(n.firstChild)
  472. hi := lo + uint32(len(n.children))
  473. maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
  474. if lo >= 1<<childrenBitsLo {
  475. return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
  476. }
  477. if hi >= 1<<childrenBitsHi {
  478. return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
  479. }
  480. enc := hi<<childrenBitsLo | lo
  481. enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
  482. if n.wildcard {
  483. enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
  484. }
  485. childrenEncoding = append(childrenEncoding, enc)
  486. } else {
  487. n.childrenIndex = n.nodeType
  488. if n.wildcard {
  489. n.childrenIndex += numNodeType
  490. }
  491. }
  492. return nil
  493. }
  494. func printNode(w io.Writer, n *node) error {
  495. for _, c := range n.children {
  496. s := "---------------"
  497. if len(c.children) != 0 {
  498. s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
  499. }
  500. encoding := labelEncoding[c.label]
  501. if c.icann {
  502. encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
  503. }
  504. encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
  505. if *comments {
  506. fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
  507. encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
  508. nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
  509. )
  510. } else {
  511. fmt.Fprintf(w, "0x%x,\n", encoding)
  512. }
  513. }
  514. return nil
  515. }
  516. func printNodeLabel(w io.Writer, n *node) error {
  517. for _, c := range n.children {
  518. fmt.Fprintf(w, "%q,\n", c.label)
  519. }
  520. return nil
  521. }
  522. func icannStr(icann bool) string {
  523. if icann {
  524. return "I"
  525. }
  526. return " "
  527. }
  528. func wildcardStr(wildcard bool) string {
  529. if wildcard {
  530. return "*"
  531. }
  532. return " "
  533. }
  534. // combineText combines all the strings in labelsList to form one giant string.
  535. // Overlapping strings will be merged: "arpa" and "parliament" could yield
  536. // "arparliament".
  537. func combineText(labelsList []string) string {
  538. beforeLength := 0
  539. for _, s := range labelsList {
  540. beforeLength += len(s)
  541. }
  542. text := crush(removeSubstrings(labelsList))
  543. if *v {
  544. fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
  545. }
  546. return text
  547. }
  548. type byLength []string
  549. func (s byLength) Len() int { return len(s) }
  550. func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
  551. func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
  552. // removeSubstrings returns a copy of its input with any strings removed
  553. // that are substrings of other provided strings.
  554. func removeSubstrings(input []string) []string {
  555. // Make a copy of input.
  556. ss := append(make([]string, 0, len(input)), input...)
  557. sort.Sort(byLength(ss))
  558. for i, shortString := range ss {
  559. // For each string, only consider strings higher than it in sort order, i.e.
  560. // of equal length or greater.
  561. for _, longString := range ss[i+1:] {
  562. if strings.Contains(longString, shortString) {
  563. ss[i] = ""
  564. break
  565. }
  566. }
  567. }
  568. // Remove the empty strings.
  569. sort.Strings(ss)
  570. for len(ss) > 0 && ss[0] == "" {
  571. ss = ss[1:]
  572. }
  573. return ss
  574. }
  575. // crush combines a list of strings, taking advantage of overlaps. It returns a
  576. // single string that contains each input string as a substring.
  577. func crush(ss []string) string {
  578. maxLabelLen := 0
  579. for _, s := range ss {
  580. if maxLabelLen < len(s) {
  581. maxLabelLen = len(s)
  582. }
  583. }
  584. for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
  585. prefixes := makePrefixMap(ss, prefixLen)
  586. for i, s := range ss {
  587. if len(s) <= prefixLen {
  588. continue
  589. }
  590. mergeLabel(ss, i, prefixLen, prefixes)
  591. }
  592. }
  593. return strings.Join(ss, "")
  594. }
  595. // mergeLabel merges the label at ss[i] with the first available matching label
  596. // in prefixMap, where the last "prefixLen" characters in ss[i] match the first
  597. // "prefixLen" characters in the matching label.
  598. // It will merge ss[i] repeatedly until no more matches are available.
  599. // All matching labels merged into ss[i] are replaced by "".
  600. func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
  601. s := ss[i]
  602. suffix := s[len(s)-prefixLen:]
  603. for _, j := range prefixes[suffix] {
  604. // Empty strings mean "already used." Also avoid merging with self.
  605. if ss[j] == "" || i == j {
  606. continue
  607. }
  608. if *v {
  609. fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
  610. prefixLen, i, j, ss[i], ss[j], suffix)
  611. }
  612. ss[i] += ss[j][prefixLen:]
  613. ss[j] = ""
  614. // ss[i] has a new suffix, so merge again if possible.
  615. // Note: we only have to merge again at the same prefix length. Shorter
  616. // prefix lengths will be handled in the next iteration of crush's for loop.
  617. // Can there be matches for longer prefix lengths, introduced by the merge?
  618. // I believe that any such matches would by necessity have been eliminated
  619. // during substring removal or merged at a higher prefix length. For
  620. // instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
  621. // would yield "abcde", which could be merged with "bcdef." However, in
  622. // practice "cde" would already have been elimintated by removeSubstrings.
  623. mergeLabel(ss, i, prefixLen, prefixes)
  624. return
  625. }
  626. }
  627. // prefixMap maps from a prefix to a list of strings containing that prefix. The
  628. // list of strings is represented as indexes into a slice of strings stored
  629. // elsewhere.
  630. type prefixMap map[string][]int
  631. // makePrefixMap constructs a prefixMap from a slice of strings.
  632. func makePrefixMap(ss []string, prefixLen int) prefixMap {
  633. prefixes := make(prefixMap)
  634. for i, s := range ss {
  635. // We use < rather than <= because if a label matches on a prefix equal to
  636. // its full length, that's actually a substring match handled by
  637. // removeSubstrings.
  638. if prefixLen < len(s) {
  639. prefix := s[:prefixLen]
  640. prefixes[prefix] = append(prefixes[prefix], i)
  641. }
  642. }
  643. return prefixes
  644. }