#!/usr/bin/python
# extensions/ConfirmEdit/captcha.py
#
# Script to generate distorted text images for a captcha system.
#
# Copyright (C) 2005 Neil Harris
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html
#
# Further tweaks by Brion Vibber <brion@pobox.com>:
# 2006-01-26: Add command-line options for the various parameters
# 2007-02-19: Add --dirs param for hash subdirectory splits
# Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
# 2008-01-06: Add regex check to skip words containing other than a-z

import random
import math
import hashlib
from optparse import OptionParser
import os
import sys
import re

# Load the imaging modules.  The legacy top-level module names are tried
# first so existing PIL installs behave exactly as before; installs that
# only provide the "PIL" package (e.g. Pillow) are handled by the fallback.
# Only ImportError is caught, so unrelated failures are no longer hidden
# behind the "requires PIL" message.
try:
	import Image
	import ImageFont
	import ImageDraw
	import ImageEnhance
	import ImageOps
except ImportError:
	try:
		from PIL import Image
		from PIL import ImageFont
		from PIL import ImageDraw
		from PIL import ImageEnhance
		from PIL import ImageOps
	except ImportError:
		sys.exit("This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/")

nonalpha = re.compile('[^a-z]') # regex to test for suitability of words

def wobbly_copy(src, wob, col, scale, ang):
	"""Return an X-axis "wobbled" copy of src, sandwiched between two rotates.

	The image is rotated by roughly ang degrees, each scan line within the
	rotated content's bounding box is shifted sideways by a sine wave plus
	some random jitter, and the result is rotated back and sharpened.

	src   -- source image (an 'RGB' PIL image is assumed, since the working
	         copy is created in that mode -- confirm against callers)
	wob   -- amplitude of the sideways shift, in pixels
	col   -- unused; kept so that existing callers keep working
	scale -- multiplier for the sine frequency, which is drawn from the
	         range 4*scale .. 5*scale
	ang   -- nominal rotation angle; a random offset of up to 30 either
	         way is added to it

	Returns src itself when the rotated image has no bounding box
	(getbbox() returned None), otherwise a new image.
	"""
	width, height = src.size
	# NOTE: the order of the three random draws below is kept as-is so that
	# a seeded generator produces the same sequence as before.
	freq = random.uniform(4*scale, 5*scale)
	phase = random.uniform(0, math.pi*2)
	angle = ang + random.uniform(-30, 30) # vary, but not too much
	canvas = Image.new('RGB', src.size, 0) # a black rectangle
	rotated = src.rotate(angle, Image.BILINEAR)
	# Do a cheap bounding-box op here to try to limit work below
	bbox = rotated.getbbox()
	if bbox is None:
		# Nothing but background: there is nothing to distort
		return src
	top, bottom = bbox[1], bbox[3]
	# and only do lines with content on
	for row in range(top, bottom+1):
		# Drop a scan line in, displaced by the sine wave plus jitter
		xoff = int(math.sin(phase+(row*freq/height))*wob)
		xoff += int(random.uniform(-wob*0.5, wob*0.5))
		canvas.paste(rotated.crop((0, row, width, row+1)), (xoff, row))
	# try to stop blurring from building up
	canvas = canvas.rotate(-angle, Image.BILINEAR)
	return ImageEnhance.Sharpness(canvas).enhance(2)


def gen_captcha(text, fontname, fontsize, file_name):
	"""Generate a captcha image and save it to file_name.

	text      -- the string to render
	fontname  -- font file handed to ImageFont.truetype()
	fontsize  -- font size handed to ImageFont.truetype()
	file_name -- output path; the image format is determined from it

	The text is drawn white on black in the middle of an oversized square
	canvas, distorted by repeated wobbly_copy() passes, cropped to its
	bounding box plus a small border, and finally inverted to black on
	white.

	Raises ValueError if the rendered image turns out to be completely
	blank (there is then no bounding box to crop to).
	"""
	# white text on a black background
	bgcolor = 0x0
	fgcolor = 0xffffff
	# create a font object
	font = ImageFont.truetype(fontname, fontsize)
	# determine dimensions of the text
	dim = font.getsize(text)
	# create a new image significantly larger than the text
	edge = max(dim[0], dim[1]) + 2*min(dim[0], dim[1])
	im = Image.new('RGB', (edge, edge), bgcolor)
	d = ImageDraw.Draw(im)
	x, y = im.size
	# add the text to the image; floor division keeps the pixel
	# coordinates integral regardless of the interpreter's "/" semantics
	d.text((x//2-dim[0]//2, y//2-dim[1]//2), text, font=font, fill=fgcolor)
	k = 3
	wob = 0.20*dim[1]/k
	rot = 45
	# Apply lots of small stirring operations, rather than a few large ones
	# in order to get some uniformity of treatment, whilst
	# maintaining randomness
	for i in range(k):
		im = wobbly_copy(im, wob, bgcolor, i*2+3, rot+0)
		im = wobbly_copy(im, wob, bgcolor, i*2+1, rot+45)
		im = wobbly_copy(im, wob, bgcolor, i*2+2, rot+90)
		rot += 30

	# now get the bounding box of the nonzero parts of the image
	bbox = im.getbbox()
	if bbox is None:
		# Fail with a clear message instead of a TypeError on bbox[0]
		raise ValueError("captcha text %r rendered as a blank image" % (text,))
	bord = min(dim[0], dim[1])//4 # a bit of a border
	im = im.crop((bbox[0]-bord, bbox[1]-bord, bbox[2]+bord, bbox[3]+bord))
	# and turn into black on white
	im = ImageOps.invert(im)

	# save the image, in format determined from filename
	im.save(file_name)

def gen_subdir(basedir, md5hash, levels):
	"""Build a nested directory path from the leading characters of
	_md5hash_, one character per level for _levels_ levels, creating
	each level under _basedir_ if it is not there yet.

	Returns the path relative to _basedir_, or None when _levels_ is
	zero or negative."""
	relative = None
	depth = 0
	while depth < levels:
		component = md5hash[depth]
		# The first level stands alone; deeper ones nest inside it
		relative = os.path.join(relative, component) if relative else component
		target = os.path.join(basedir, relative)
		if not os.path.exists(target):
			os.mkdir(target)
		depth += 1
	return relative

def try_pick_word(words, blacklist, verbose, nwords, min_length, max_length):
	"""Make a single attempt at producing a captcha challenge string.

	words      -- list of candidate words, or None to build a string of
	              random lowercase letters instead
	blacklist  -- iterable of substrings that must not occur in the result
	verbose    -- print the candidate and the reason for any rejection
	nwords     -- number of words from the list to concatenate (at least
	              one word is always drawn)
	min_length -- minimum acceptable length of the result
	max_length -- maximum acceptable length; zero or less means "no limit"
	              for word-list challenges and a limit of 10 (or
	              min_length, if that is larger) for random-letter ones

	Returns the challenge string, or None if no acceptable candidate was
	produced on this attempt.
	"""
	if words is not None:
		if not words:
			# Nothing to draw from; report failure rather than crash
			if verbose:
				print("word list is empty")
			return None
		word = ''.join(random.choice(words) for _ in range(max(1, nwords)))
	else:
		if max_length <= 0:
			# No explicit limit: default to 10, but never below min_length
			max_length = max(10, min_length)
		if min_length > max_length:
			# randint() would raise on an empty range
			if verbose:
				print("no length is between %d and %d characters" % (min_length, max_length))
			return None
		length = random.randint(min_length, max_length)
		# chr(97) .. chr(122) are the letters 'a' .. 'z'
		word = ''.join(chr(97 + random.randint(0, 25)) for _ in range(length))

	if verbose:
		print("word is %s" % word)

	if len(word) < min_length:
		if verbose:
			print("skipping word pair '%s' because it has fewer than %d characters" % (word, min_length))
		return None

	if max_length > 0 and len(word) > max_length:
		if verbose:
			print("skipping word pair '%s' because it has more than %d characters" % (word, max_length))
		return None

	if nonalpha.search(word):
		if verbose:
			print("skipping word pair '%s' because it contains non-alphabetic characters" % word)
		return None

	for naughty in blacklist:
		if naughty in word:
			if verbose:
				print("skipping word pair '%s' because it contains blacklisted word '%s'" % (word, naughty))
			return None
	return word

def pick_word(words, blacklist, verbose, nwords, min_length, max_length):
	"""Return a challenge string accepted by try_pick_word().

	Up to 1000 attempts are made; if none of them yields a usable
	string the script exits with an error message.
	"""
	attempts_left = 1000
	while attempts_left > 0:
		candidate = try_pick_word(words, blacklist, verbose, nwords, min_length, max_length)
		if candidate:
			return candidate
		attempts_left -= 1
	# No valid combination turned up within the attempt budget: give up
	sys.exit("Unable to find valid word combinations")

def read_wordlist(filename):
	"""Read a word list file and return its entries as a list.

	Each line is stripped of surrounding whitespace and lower-cased.
	Blank lines are skipped: an empty string is a substring of every
	word, so an empty blacklist entry would cause every candidate to be
	rejected.

	filename -- path of a text file holding one word per line
	"""
	f = open(filename)
	try:
		stripped = (line.strip().lower() for line in f)
		return [entry for entry in stripped if entry]
	finally:
		# Close the file even if reading fails part-way through
		f.close()

if __name__ == '__main__':
	# Command-line entry point.
	#
	# Picks random words from a word list (one word per line), or random
	# letters with --random, and generates a captcha image for each
	# challenge, with a random salt and a keyed salted hash of the correct
	# answer in the file name.
	#
	# To check a reply, hash it in the same way with the same salt and
	# secret key, then compare with the hash value given.
	script_dir = os.path.dirname(os.path.realpath(__file__))

	parser = OptionParser()
	parser.add_option("--wordlist", metavar="WORDS.txt",
		help="A list of words (required)")
	parser.add_option("--random", action="store_true",
		help="Use random charcters instead of a wordlist")
	parser.add_option("--key", metavar="KEY",
		help="The passphrase set as $wgCaptchaSecret (required)")
	parser.add_option("--output", metavar="DIR",
		help="The directory to put the images in - $wgCaptchaDirectory (required)")
	parser.add_option("--font", metavar="FONT.ttf",
		help="The font to use (required)")
	parser.add_option("--font-size", metavar="N", type='int', default=40,
		help="The font size (default 40)")
	parser.add_option("--count", metavar="N", type='int', default=20,
		help="The maximum number of images to make (default 20)")
	parser.add_option("--blacklist", metavar="FILE",
		default=os.path.join(script_dir, "blacklist"),
		help="A blacklist of words that should not be used")
	parser.add_option("--fill", metavar="N", type='int',
		help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs")
	parser.add_option("--dirs", metavar="N", type='int',
		help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels")
	parser.add_option("--verbose", "-v", action='store_true',
		help="Show debugging information")
	parser.add_option("--number-words", type='int', default=2,
		help="Number of words from the wordlist which make a captcha challenge (default 2)")
	parser.add_option("--min-length", type='int', default=1,
		help="Minimum length for a captcha challenge")
	parser.add_option("--max-length", type='int', default=-1,
		help="Maximum length for a captcha challenge")

	opts, args = parser.parse_args()

	# Check the mandatory settings in a fixed order and stop at the first
	# one that is missing.
	if not opts.wordlist and not opts.random:
		sys.exit("Need to specify a wordlist")
	wordlist = opts.wordlist or None
	if not opts.key:
		sys.exit("Need to specify a key")
	key = opts.key
	if not opts.output:
		sys.exit("Need to specify an output directory")
	output = opts.output
	if not (opts.font and os.path.exists(opts.font)):
		sys.exit("Need to specify the location of a font")
	font = opts.font

	blacklist = read_wordlist(opts.blacklist)
	verbose = opts.verbose
	fontsize = opts.font_size
	dirs = opts.dirs
	fill = opts.fill
	count = opts.count
	if fill:
		# Only make as many images as are needed to reach the target
		count = max(0, fill - len(os.listdir(output)))

	words = None
	if wordlist:
		# Keep only 4- or 5-letter words that do not start with "f" and
		# neither begin nor end with a doubled letter
		words = [w for w in read_wordlist(wordlist)
			if 4 <= len(w) <= 5 and not w.startswith("f")
			and w[0] != w[1] and w[-1] != w[-2]]

	for i in range(count):
		word = pick_word(words, blacklist, verbose, opts.number_words, opts.min_length, opts.max_length)
		salt = "%08x" % random.randrange(2**32)
		# 64 bits of hash is plenty for this purpose
		keyed = key + salt
		md5hash = hashlib.md5(keyed + word + keyed).hexdigest()[:16]
		filename = "image_%s_%s.png" % (salt, md5hash)
		if dirs:
			filename = os.path.join(gen_subdir(output, md5hash, dirs), filename)
		if verbose:
			print(filename)
		gen_captcha(word, font, fontsize, os.path.join(output, filename))