summaryrefslogtreecommitdiff
path: root/elisp/erbot/erburl.el
diff options
context:
space:
mode:
Diffstat (limited to 'elisp/erbot/erburl.el')
-rw-r--r--elisp/erbot/erburl.el219
1 files changed, 219 insertions, 0 deletions
diff --git a/elisp/erbot/erburl.el b/elisp/erbot/erburl.el
new file mode 100644
index 0000000..4995b45
--- /dev/null
+++ b/elisp/erbot/erburl.el
@@ -0,0 +1,219 @@
+;;; erburl.el --- Learn terms from a url.
+;; Time-stamp:
+;; Copyright (C) 2004 Pete Kazmier
+;; Emacs Lisp Archive entry
+;; Filename: erburl.el
+;; Package: erburl
+;; Author: Pete Kazmier <pete-erbot-dev@kazmier.com>
+;; Keywords:
+;; Version:
+;; URL: http://www.emacswiki.org/cgi-bin/wiki.pl?ErBot
+
+(defconst erbtrain-home-page
+ "http://www.emacswiki.org/cgi-bin/wiki.pl?ErBot")
+
+
+;; This file is NOT (yet) part of GNU Emacs.
+
+;; This is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+
+;; This is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs; see the file COPYING. If not, write to the
+;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+;; Boston, MA 02111-1307, USA.
+
+
+;; See also:
+
+
+;; Quick start:
+(defconst erburl-quick-start
+ "This library enables one to add and remove entries to your bot's
+bbdb that have been \"scraped\" from sources on the web. When using
+this library, you must be running under the uid of your bot. In
+addition, (although I'm not positive), you should make sure that only
+one instance of your bot is running to avoid clobbering the bbdb. I
+use this library in the emacs session of my running bot. Although the
+main function of this library is asynchronous, when adding a lot of
+terms, the local session is not really useable (you see the progress
+though), however the bot still responds to requests from its channels.
+
+With that all said, lets add the terms from a wiki using the wiki page
+name as the term. The note attached to the term will be a string in
+the form of \"at URL\" where URL is the URL to the page:
+
+ (erburl-scrape-terms
+ \"http://www.emacswiki.org/cgi-bin/wiki?action=index\")
+
+This library can be used to add terms from any web page because you
+can pass your own parser to ERBURL-SCRAPE-TERMS (see the doc string
+for the full details). This includes parsing pages and adding notes
+that contain information other than a simple link back to the original
+page.
+
+The library also includes a function to remove all entries that
+contain a specific URL in the notes of an entry. It will only remove
+the term in its entirety if it does not contain other notes for the
+same term. It should be noted that this function is not asynchronous
+and will cause your bot to stop responding on channels if it is
+deleting a large number of records.
+
+Finally, for an additional reference to using this library, please see
+erbjavadoc which uses this library to provide a command that will
+permit users to add javadoc entries from a set of javadoc pages.
+"
+ )
+
+(defun erburl-quick-start ()
+ "Provides electric help from variable `erburl-quick-start'."
+ (interactive)
+ (with-electric-help
+ '(lambda () (insert erburl-quick-start) nil) "*doc*"))
+
+(defconst erburl-version "0.0dev")
+
+;;==========================================
+;;; Requires:
+(require 'cl)
+(require 'url)
+
+;;; Code:
+
+(defgroup erburl nil
+ "The group erburl."
+ :group 'applications)
+
+(defcustom erburl-before-load-hooks nil
+ "Hooks to run before loading erburl."
+ :group 'erburl)
+
+(defcustom erburl-after-load-hooks nil
+ "Hooks to run after loading erburl."
+ :group 'erburl)
+
+(run-hooks 'erburl-before-load-hooks)
+
+;;; Real Code:
+
+
+(defun erburl-scrape-terms (url &optional entry-parser-fn progress-callback cbargs)
+ "Scrape terms from URL using the ENTRY-PARSER-FN and add them to the
+erbot's botbbdb. Due to the asynchronous nature of this call, messages
+are sent to PROGRESS-CALLBACK to report process.
+
+ENTRY-PARSER-FN is called when the contents of the URL have finished
+downloading into a buffer. The contents of the buffer include any
+headers that were sent followed by a blank line and then followed by
+the actual contents of the URL. When ENTRY-PARSER-FN is called, this
+buffer has already been selected. ENTRY-PARSER-FN is passed CBARGS as
+arguments, and must return a list of entries to be added to the bbdb.
+Each entry should be a list of two elements with the term as the first
+element and the definition as the second. The default parser used if
+one is not specified is ERBURL-HREF-PARSER (which parses href links).
+
+PROGRESS-CALLBACK is called once after the entries have been added to
+the bbdb with a descriptive message indicating how many terms were
+added. It may also be called after the entries have been parsed with
+a message indicating that it will take a significant amount of time to
+add the entries to the bbdb. When PROGRESS-CALLBACK is called, it is
+passed a message as the first argument and then CBARGS are passed as
+additional arguments. The default callback used if one is not
+specified is MESSAGE.
+
+CBARGS are passed as additional argements to both of the callback
+functions.
+"
+ (let ((parser (or entry-parser-fn 'erburl-href-parser))
+ (progress (or progress-callback 'message)))
+ (url-retrieve url
+ 'erburl-scrape-callback
+ (list url parser progress cbargs))))
+
+(defun erburl-scrape-callback (url entry-parser-fn progress-callback cbargs)
+ "Callback invoked by url-retrieve. It is invoked in the buffer with
+the contents of the retrieved URL. In addition, this method is passed
+two additional callbacks to assist during processing (please refer to
+erburl-scrape-terms doc). Finally, CBARGS is a list of arguments that
+will be passed as additional arguments to the callback functions (I
+wish elisp supported lexical closures!)"
+ (goto-char (point-min))
+ (let* ((buffer (current-buffer))
+ (count 0)
+ (entries (apply entry-parser-fn cbargs))
+ (delay 0.1)
+ (total (length entries))
+ (eta (* total delay)))
+ (when (> eta 10)
+ (apply progress-callback
+ (format "Processing %d entries from %s will take at least %.1f minutes ..."
+ total url (/ eta 60))
+ cbargs))
+ (erbot-working
+ (dolist (entry entries)
+ (message "Adding entry for %s" (first entry))
+ (sleep-for 0.1)
+ ;; I need to find a way to speed this up. As the bbdb gets
+ ;; larger things really start to slow down significantly.
+ (when (or (ignore-errors (apply 'fs-set-also entry))
+ (ignore-errors (apply 'fs-set-term entry)))
+ (incf count))))
+ (erbbdb-save)
+ (apply progress-callback
+ (format "Added %d entries from %s." count url)
+ cbargs)
+ (kill-buffer buffer)))
+
+;; This needs to be asynchronous if we are to make an fsi-* version
+;; for IRC users to execute because this function is very slow when
+;; removing a large number of entries from the bbdb.
+(defun erburl-forget-terms (url)
+ "Remove all terms and entries for the specified URL. This will
+remove terms from the bbdb entirely unless a particular term has more
+than one entry, in which case, only the relevant entry is removed.
+Note: this function is not asynchronous and will cause your bot to
+stop responding on channels if it is removing a large number of
+entries that match the specified URL."
+ (unless (string-match "^https?://[^/]+/" url)
+ (error "The specified URL is not well-formed"))
+ (let ((count 0)
+ (regexp (regexp-quote url))
+ (erbforget-interactivity -100))
+ (erbot-working
+ (setq count (erbforget-sw regexp nil t)))
+ (erbbdb-save)
+ count))
+
+(defun erburl-href-parser (&optional base terms-with-spaces-p)
+ "Returns a list of lists representing the HTML links in the current
+buffer. Each list is composed of a term and a string indicating the
+link which is prefixed with BASE if supplied. If TERMS-WITH-SPACES-P
+is non-nil, only links with single word text will be included."
+ (let ((entries '())
+ (case-fold-search t))
+ (while (re-search-forward
+ (if terms-with-spaces-p
+ "<a .*?href=\"\\([^\"]+\\)\".*?>\\(?:<[^>]+>\\)*\\([^ <]+\\)\\(?:<[^>]+>\\)*</a>"
+ "<a .*?href=\"\\([^\"]+\\)\".*?>\\(?:<[^>]+>\\)*\\([^<]+\\)\\(?:<[^>]+>\\)*</a>")
+ nil t)
+ (push (list (match-string 2)
+ (concat "at "
+ (when base (concat base "/"))
+ (match-string 1)))
+ entries))
+ entries))
+
+(defun erburl-safe-url (url)
+ )
+
+(provide 'erburl)
+(run-hooks 'erburl-after-load-hooks)
+
+;;; erburl.el ends here