diff options
author | Luke T. Shumaker <lukeshu@lukeshu.com> | 2024-06-08 17:51:41 -0600 |
---|---|---|
committer | Luke T. Shumaker <lukeshu@lukeshu.com> | 2024-06-08 17:51:41 -0600 |
commit | 5dc2e9533a111d75ff91a56dd50af8e03ebf5f5f (patch) | |
tree | e90d2b74612ecb44fb0e41a19e44483f90a071ba | |
parent | f6080300406a674419dba5005c76bc424df35502 (diff) |
wip pipermail threading
-rw-r--r-- | cmd/generate/forge_github.go | 68 | ||||
-rw-r--r-- | cmd/generate/forge_pipermail.go | 133 | ||||
-rw-r--r-- | cmd/generate/mailstuff/mbox.go | 38 | ||||
-rw-r--r-- | cmd/generate/mailstuff/thread.go | 248 | ||||
-rw-r--r-- | cmd/generate/src_contribs.go | 19 |
5 files changed, 445 insertions, 61 deletions
diff --git a/cmd/generate/forge_github.go b/cmd/generate/forge_github.go index d3618ce..d29e3f7 100644 --- a/cmd/generate/forge_github.go +++ b/cmd/generate/forge_github.go @@ -7,7 +7,10 @@ import ( "time" ) -var reGitHubPR = regexp.MustCompile(`^https://github\.com/([^/?#]+)/([^/?#]+)/pull/([0-9]+)(?:\?[^#]*)?(?:#.*)?$`) +var ( + reGitHubPR = regexp.MustCompile(`^https://github\.com/([^/?#]+)/([^/?#]+)/pull/([0-9]+)(?:\?[^#]*)?(?:#.*)?$`) + reGitHubCommit = regexp.MustCompile(`^https://github\.com/([^/?#]+)/([^/?#]+)/commit/([0-9a-f]+)(?:\?[^#]*)?(?:#.*)?$`) +) func githubPagination(i int) url.Values { params := make(url.Values) @@ -20,6 +23,7 @@ type GitHub struct{} var _ Forge = GitHub{} func (GitHub) FetchStatus(urls []string) (string, error) { + // PR for _, u := range urls { m := reGitHubPR.FindStringSubmatch(u) if m == nil { @@ -54,6 +58,31 @@ func (GitHub) FetchStatus(urls []string) (string, error) { return ret, nil } + // Commits from a non-PR + var gitURL string + var gitCommits []string + for _, u := range urls { + if m := reGitHubCommit.FindStringSubmatch(u); m != nil { + user := m[1] + repo := m[2] + hash := m[3] + + gitURL = "https://github.com/" + user + "/" + repo + gitCommits = append(gitCommits, hash) + } + } + if len(gitCommits) > 0 { + ret := statusMerged + tag, err := getGitTagThatContainsAll(gitURL, gitCommits...) + if err != nil { + return "", err + } + if tag != "" { + ret = fmt.Sprintf(statusReleasedFmt, tag) + } + return ret, nil + } + // Nope return "", nil } @@ -81,6 +110,7 @@ func (GitHub) FetchSubmittedAt(urls []string) (time.Time, error) { } func (GitHub) FetchLastUpdated(urls []string) (time.Time, User, error) { + // PR for _, u := range urls { m := reGitHubPR.FindStringSubmatch(u) if m == nil { @@ -184,5 +214,41 @@ func (GitHub) FetchLastUpdated(urls []string) (time.Time, User, error) { return retUpdatedAt, retUser, nil } + // Commits from a non-PR + { + var ret time.Time + for _, u := range urls { + if m := reGitHubCommit.FindStringSubmatch(u); m != nil { + user := m[1] + repo := m[2] + hash := m[3] + + urlStr := "https://api.github.com/repos/" + user + "/" + repo + "/commits/" + hash + var obj struct { + Commit struct { + Author struct { + Date time.Time `json:"date"` + } `json:"author"` + Committer struct { + Date time.Time `json:"date"` + } `json:"committer"` + } `json:"commit"` + } + if err := httpGetJSON(urlStr, nil, &obj); err != nil { + return time.Time{}, User{}, err + } + if obj.Commit.Author.Date.After(ret) { + ret = obj.Commit.Author.Date + } + if obj.Commit.Committer.Date.After(ret) { + ret = obj.Commit.Committer.Date + } + } + } + if ret.IsZero() { + return time.Time{}, User{}, nil + } + } + // Nope return time.Time{}, User{}, nil } diff --git a/cmd/generate/forge_pipermail.go b/cmd/generate/forge_pipermail.go index 2c5cf01..e015bb5 100644 --- a/cmd/generate/forge_pipermail.go +++ b/cmd/generate/forge_pipermail.go @@ -2,14 +2,17 @@ package main import ( "fmt" + "net/url" "regexp" + "strconv" "strings" "time" ) var ( - rePiperMailDate = regexp.MustCompile(`^\s*<I>([^<]+)</I>\s*$`) - reGitHubCommit = regexp.MustCompile(`^https://github\.com/([^/?#]+)/([^/?#]+)/commit/([0-9a-f]+)(?:\?[^#]*)?(?:#.*)?$`) + rePiperMailMessage = regexp.MustCompile(`^(https?://.*/pipermail/.*/)([0-4]{4}-(?:January|February|March|April|May|June|July|August|September|October|November|December))/([0-9]+)\.html$`) + rePiperMailDate = regexp.MustCompile(`^\s*<I>([^<]+)</I>\s*$`) + rePiperMailReply = regexp.MustCompile(`^\s*<LINK REL="made" HREF="(.*)">\s$`) ) type PiperMail struct{} @@ -17,35 +20,12 @@ type PiperMail struct{} var _ Forge = PiperMail{} func (PiperMail) FetchStatus(urls []string) (string, error) { - var gitURL string - var gitCommits []string - for _, u := range urls { - if m := reGitHubCommit.FindStringSubmatch(u); m != nil { - user := m[1] - repo := m[2] - hash := m[3] - - gitURL = "https://github.com/" + user + "/" + repo - gitCommits = append(gitCommits, hash) - } - } - if len(gitCommits) == 0 { - return "", nil - } - ret := statusMerged - tag, err := getGitTagThatContainsAll(gitURL, gitCommits...) - if err != nil { - return "", err - } - if tag != "" { - ret = fmt.Sprintf(statusReleasedFmt, tag) - } - return ret, nil + return "", nil } func (PiperMail) FetchSubmittedAt(urls []string) (time.Time, error) { for _, u := range urls { - if !strings.Contains(u, "/pipermail/") { + if !rePiperMailMessage.MatchString(u) { continue } htmlStr, err := httpGet(u, nil) @@ -61,38 +41,79 @@ func (PiperMail) FetchSubmittedAt(urls []string) (time.Time, error) { return time.Time{}, nil } -func (PiperMail) FetchLastUpdated(urls []string) (time.Time, User, error) { - var ret time.Time +func (PiperMail) nextMonth(ym string) string { + yStr, mStr, ok := strings.Cut(ym, "-") + if !ok { + panic(fmt.Errorf("invalid year-month: %q", ym)) + } + switch mStr { + case "January": + return yStr + "-February" + case "February": + return yStr + "-March" + case "March": + return yStr + "-April" + case "April": + return yStr + "-May" + case "May": + return yStr + "-June" + case "June": + return yStr + "-July" + case "July": + return yStr + "-August" + case "August": + return yStr + "-September" + case "September": + return yStr + "-October" + case "October": + return yStr + "-November" + case "November": + return yStr + "-December" + case "December": + y, _ := strconv.Atoi(yStr) + return fmt.Sprintf("%d-January", y+1) + default: + panic(fmt.Errorf("invalid year-month: %q", ym)) + } +} + +func (PiperMail) messageID(u string) (string, error) { +} + + +func (p PiperMail) FetchLastUpdated(urls []string) (time.Time, User, error) { for _, u := range urls { - if m := reGitHubCommit.FindStringSubmatch(u); m != nil { - user := m[1] - repo := m[2] - hash := m[3] + m := rePiperMailMessage.FindStringSubmatch(u) + if m == nil { + continue + } + uBase := m[1] + uYM := m[2] + //uInt := m[3] - urlStr := "https://api.github.com/repos/" + user + "/" + repo + "/commits/" + hash - var obj struct { - Commit struct { - Author struct { - Date time.Time `json:"date"` - } `json:"author"` - Committer struct { - Date time.Time `json:"date"` - } `json:"committer"` - } `json:"commit"` - } - if err := httpGetJSON(urlStr, nil, &obj); err != nil { - return time.Time{}, User{}, err - } - if obj.Commit.Author.Date.After(ret) { - ret = obj.Commit.Author.Date - } - if obj.Commit.Committer.Date.After(ret) { - ret = obj.Commit.Committer.Date + htmlStr, err := httpGet(u, nil) + if err != nil { + return time.Time{}, User{}, err + } + var msgid string + for _, line := range strings.Split(htmlStr, "\n") { + if m := rePiperMailReply.FindStringSubmatch(line); m != nil { + ru, err := url.Parse(m[1]) + if err != nil { + continue + } + if msgid = ru.Query().Get("In-Reply-To"); msgid != "" { + break + } } } + if msgid == "" { + continue + } + mboxStr, err := httpGet(uBase+uYM+".txt.gz", nil) + if err != nil { + return time.Time{}, User{}, err + } } - if ret.IsZero() { - return time.Time{}, User{}, nil - } - return ret, User{}, nil + return time.Time{}, User{}, nil } diff --git a/cmd/generate/mailstuff/mbox.go b/cmd/generate/mailstuff/mbox.go new file mode 100644 index 0000000..8700c24 --- /dev/null +++ b/cmd/generate/mailstuff/mbox.go @@ -0,0 +1,38 @@ +package mailstuff + +import ( + "bytes" + "io" + "net/mail" +) + +func ReadMBox(r io.Reader) ([]*mail.Message, error) { + rest, err := io.ReadAll(r) + if err != nil { + return nil, err + } + + const terminator = "\nFrom " + + var parts [][]byte + for { + pos := bytes.Index(rest, []byte(terminator)) + if pos < 0 { + parts = append(parts, rest) + break + } + parts = append(parts, rest[:pos+1]) + rest = rest[pos+1:] + } + + ret := make([]*mail.Message, len(parts)) + for i := range len(parts) { + msg, err := mail.ReadMessage(bytes.NewReader(parts[i])) + if err != nil { + return nil, err + } + ret[i] = msg + } + + return ret, nil +} diff --git a/cmd/generate/mailstuff/thread.go b/cmd/generate/mailstuff/thread.go new file mode 100644 index 0000000..c6fa181 --- /dev/null +++ b/cmd/generate/mailstuff/thread.go @@ -0,0 +1,248 @@ +package mailstuff + +import ( + "regexp" + "strings" +) + +type set[T comparable] map[T]struct{} + +func (s set[T]) Insert(val T) { + s[val] = struct{}{} +} + +func (s set[T]) Has(val T) bool { + _, ok := s[val] + return ok +} + +func (s set[T]) PickOne() T { + for v := range s { + return v + } + var zero T + return zero +} + +//////////////////////////////////////////////////////////////////////////////// + +// https://www.jwz.org/doc/threading.html + +// Definitions ///////////////////////////////////////////////////////////////// + +type jwzContainer struct { + Message *jwzMessage + Parent *jwzContainer + Children set[*jwzContainer] +} + +type jwzMessage struct { + Subject string + ID jwzID + References []jwzID +} + +type jwzID string + +func (ancestor *jwzContainer) IsAncestorOf(descendent *jwzContainer) bool { + if ancestor == descendent { + return true + } + for child := range ancestor.Children { + if child.IsAncestorOf(descendent) { + return true + } + } + return false +} + +// The Algorithm /////////////////////////////////////////////////////////////// + +var jwzSubjectRE = regexp.MustCompile(`^(?:\s*[Rr][Ee](?:\[[0-9]+\])?:)*`) + +func jwzThreadMessages(msgs map[jwzID]*jwzMessage) set[*jwzContainer] { + idTable := make(map[jwzID]*jwzContainer, len(msgs)) + + // 1. For each message + for _, msg := range msgs { + // A. + msgContainer := idTable[msg.ID] + if msgContainer != nil && msgContainer.Message == nil { + msgContainer.Message = msg + } else { + msgContainer = &jwzContainer{ + Message: msg, + Children: make(set[*jwzContainer]), + } + idTable[msg.ID] = msgContainer + } + // B. + for _, refID := range msg.References { + refContainer := idTable[refID] + if refContainer == nil { + refContainer = &jwzContainer{ + Children: make(set[*jwzContainer]), + } + idTable[refID] = refContainer + } + } + for i := 0; i+1 < len(msg.References); i++ { + parent := idTable[msg.References[i]] + child := idTable[msg.References[i+1]] + if !parent.IsAncestorOf(child) && !child.IsAncestorOf(parent) { + parent.Children.Insert(child) + child.Parent = parent + } + } + // C. + if len(msg.References) == 0 { + if msgContainer.Parent != nil { + delete(msgContainer.Parent.Children, msgContainer) + } + msgContainer.Parent = nil + } else { + msgContainer.Parent = idTable[msg.References[len(msg.References)-1]] + msgContainer.Parent.Children.Insert(msgContainer) + } + } + + // 2. Find the root set + roots := make(set[*jwzContainer]) + for _, container := range idTable { + if container.Parent == nil { + roots.Insert(container) + } + } + + // 3. Discard id_table + idTable = nil + + // 4. Prune empty containers + pseudoRoot := &jwzContainer{ + Children: roots, + } + for root := range roots { + root.Parent = pseudoRoot + } + var recurse func(*jwzContainer) + recurse = func(container *jwzContainer) { + // Recurse. This is a touch complicated because + // `recurse(child)` might insert into + // `container.Children`, and those insertions might + // not be emitted by the range loop + for visited := make(set[*jwzContainer]); ; { + beforeSize := len(visited) + for child := range container.Children { + if visited.Has(child) { + continue + } + recurse(child) + visited.Insert(child) + } + if len(visited) == beforeSize { + break + } + } + // Main. + if container.Message == nil { + if len(container.Children) == 0 { // A. + delete(container.Parent.Children, container) + } else { // B. + if len(container.Children) == 1 || container.Parent != pseudoRoot { + for child := range container.Children { + container.Parent.Children.Insert(child) + child.Parent = container.Parent + } + delete(container.Parent.Children, container) + } + } + } + } + for root := range roots { + recurse(root) + } + for root := range roots { + root.Parent = nil + } + pseudoRoot = nil + + // 5. Group root set by subject + // A. + subjectTable := make(map[string]*jwzContainer) + // B. + for root := range roots { + var subject string + if root.Message != nil { + subject = root.Message.Subject + } else { + subject = root.Children.PickOne().Message.Subject + } + prefix := jwzSubjectRE.FindString(subject) + subject = strings.TrimSpace(subject[len(prefix):]) + if subject == "" { + continue + } + if other := subjectTable[subject]; other == nil { + subjectTable[subject] = root + } else if other.Message == nil { + subjectTable[subject] = root + } else if jwzSubjectRE.MatchString(other.Message.Subject) && prefix == "" { + subjectTable[subject] = root + } + } + // C. + for root := range roots { + var subject string + if root.Message != nil { + subject = root.Message.Subject + } else { + subject = root.Children.PickOne().Message.Subject + } + prefix := jwzSubjectRE.FindString(subject) + subject = strings.TrimSpace(subject[len(prefix):]) + + other := subjectTable[subject] + if other == nil || other == root { + continue + } + + switch { + case root.Message == nil && other.Message == nil: + for child := range root.Children { + other.Children.Insert(child) + child.Parent = other + } + delete(roots, root) + case (root.Message == nil) != (other.Message == nil): + var empty, nonEmpty *jwzContainer + if root.Message == nil { + empty = root + nonEmpty = other + } else { + empty = other + nonEmpty = root + } + empty.Children.Insert(nonEmpty) + nonEmpty.Parent = empty + case other.Message != nil && !jwzSubjectRE.MatchString(other.Message.Subject) && prefix != "": + other.Children.Insert(root) + root.Parent = other + // skip the reverse of the above case--it happened implicitly + default: + newRoot := &jwzContainer{ + Children: make(set[*jwzContainer], 2), + } + newRoot.Children.Insert(root) + root.Parent = newRoot + newRoot.Children.Insert(other) + other.Parent = newRoot + subjectTable[subject] = newRoot + roots.Insert(newRoot) + delete(roots, root) + delete(roots, other) + } + } + + // 6. Now you're done threading + return roots +} diff --git a/cmd/generate/src_contribs.go b/cmd/generate/src_contribs.go index 9c7bcd6..0ead1cd 100644 --- a/cmd/generate/src_contribs.go +++ b/cmd/generate/src_contribs.go @@ -130,15 +130,19 @@ func (c Contribution) fetchStatus() (string, error) { } func (c Contribution) fetchSubmittedAt() (time.Time, error) { + var ret time.Time for _, forge := range forges { submittedAt, err := forge.FetchSubmittedAt(c.URLs) if err != nil { return time.Time{}, err } - if !submittedAt.IsZero() { - return submittedAt, nil + if !submittedAt.IsZero() && (ret.IsZero() || submittedAt.Before(ret)) { + ret = submittedAt } } + if !ret.IsZero() { + return ret, nil + } return time.Time{}, fmt.Errorf("idk how to get created timestamp for %q", c.URLs[0]) } @@ -151,14 +155,21 @@ func withinOneSecond(a, b time.Time) bool { } func (c Contribution) fetchLastUpdated() (time.Time, User, error) { + var ret struct { + time.Time + User + } for _, forge := range forges { updatedAt, updatedBy, err := forge.FetchLastUpdated(c.URLs) if err != nil { return time.Time{}, User{}, err } - if !updatedAt.IsZero() { - return updatedAt, updatedBy, nil + if !updatedAt.IsZero() && (ret.Time.IsZero() || updatedAt.After(ret.Time)) { + ret.Time, ret.User = updatedAt, updatedBy } } + if !ret.Time.IsZero() { + return ret.Time, ret.User, nil + } return time.Time{}, User{}, nil //fmt.Errorf("idk how to get updated timestamp for %q", c.URLs[0]) } |