spider/perl/BadWords.pm

#
# Search for bad words in strings
#
# Copyright (c) 2000 Dirk Koopman
#
# $Id$
#

package BadWords;

use strict;

use DXUtil;
use DXVars;
use DXHash;
use DXDebug;

use IO::File;

# Package globals:
#   $badword - the DXHash object (created below with the name "badword")
#              that check() consults for exact word matches
#   @regex   - list of matcher code refs built by create_regex()
use vars qw($badword @regex);

# $oldfn is the legacy word list; load() imports it and then deletes it.
# $regex is the word file that create_regex() reads to build @regex.
# $bwfn is only used here, for seeding from its .issue default
# (presumably the file behind the "badword" DXHash - TODO confirm).
my $oldfn = "$main::data/badwords";
my $regex = "$main::data/badw_regex";
my $bwfn = "$main::data/badword";

# Seed the working files from the shipped ".issue" defaults, but only when
# the working file does not exist yet, so local edits are never overwritten.
# (filecopy is not defined in this file; presumably imported from DXUtil.)
filecopy("$regex.issue", $regex) unless -e $regex;
filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;

# created at module load time, after the default file has been seeded above
$badword = new DXHash "badword";

# Version bookkeeping: the numbers are parsed out of the revision-control
# Revision keyword and accumulated into the program-wide build/branch totals.
# NOTE(review): sprintf returns a string such as "0.000" even when the match
# fails, and that string is true, so the "|| 0" fallback on $BRANCH looks
# like it can never take effect - confirm whether that matters.
use vars qw($VERSION $BRANCH);
$VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ );
$BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/ ) || 0;
$main::build += $VERSION;
$main::branch += $BRANCH;

# load the badwords file
#
# If the legacy word list ($oldfn) can still be opened, every
# whitespace-separated word on its non-comment lines is added to $badword,
# put() is called on $badword (presumably saving it - TODO confirm) and the
# legacy file is removed. A missing legacy file is not an error.
#
# The matcher list is then (re)built by create_regex().
#
# Takes no arguments. Returns the list of messages from create_regex(),
# which is empty when the regex word file could be read.
sub load
{
	my @out;
	my $fh = IO::File->new($oldfn);

	if ($fh) {
		# read into a lexical: a bare while (<$fh>) assigns to the global $_
		# without localising it, which would clobber the caller's $_
		while (defined(my $line = <$fh>)) {
			chomp $line;
			next if $line =~ /^\s*\#/;

			# NOTE(review): words are stored exactly as written, whereas
			# check() looks up upper-cased tokens - confirm that the legacy
			# file (or DXHash::add) takes care of the case
			for my $word (split " ", $line) {
				$badword->add($word);
			}
		}
		$fh->close;
		$badword->put;

		# a failure here only means the file is re-imported next time,
		# so log it rather than report it to the caller
		unlink $oldfn or dbg("load: can't remove $oldfn $!");
	}
	push @out, create_regex();
	return @out;
}

# (re)build the list of matcher subs in @regex from the regex word file
#
# Every whitespace-separated word on the non-comment lines of $regex is
# upper-cased and turned into a sub that takes one string and returns the
# upper-cased word if the string contains that word "spelled out", or the
# empty list otherwise. "Spelled out" means: each letter repeated one or more
# times, with one or more whitespace / non-word characters between
# consecutive letters (e.g. 'F.U  C' style text). The word written plainly,
# with nothing between its letters, is NOT matched by these subs.
#
# Takes no arguments. Returns an empty list on success, or a one element
# list holding an error message if the file cannot be opened (the message
# is also sent to dbg). @regex is emptied in either case.
sub create_regex
{
	my @out;
	@regex = ();

	my $fh = IO::File->new($regex);
	unless ($fh) {
		# build the message straight away, before anything can disturb $!
		my $msg = "can't open $regex $!";
		dbg($msg);
		push @out, $msg;
		return @out;
	}

	# read into a lexical: a bare while (<$fh>) assigns to the global $_
	# without localising it, which would clobber the caller's $_
	while (defined(my $line = <$fh>)) {
		chomp $line;
		next if $line =~ /^\s*\#/;

		for my $word (split " ", $line) {
			# a fresh $w per word, so that each closure keeps its own copy
			my $w = uc $word;

			# Each character is quoted, so that whatever is in the file is
			# matched literally and can neither break the pattern nor be
			# executed as code (the words used to be pasted into a string
			# eval). For alphanumeric words the pattern is unchanged.
			my $e = join '[\s\W]+', map { quotemeta($_) . '+' } split //, $w;
			my $re = eval { qr/$e/ };
			unless (defined $re) {
				dbg("create_regex: $@");
				next;
			}
			push @regex, sub { return $_[0] =~ $re ? $w : () };
		}
	}
	$fh->close;

	return @out;
}

# check the text against the badwords list
#
# Takes one string. It is upper-cased and then:
#
#  1. passed to every matcher sub in @regex; if any of them return anything,
#     those results are returned and the second step is skipped.
#  2. otherwise split on whitespace and each token is looked up in $badword,
#     first exactly as written and then, failing that, with a trailing
#     S or 'S removed.
#
# Returns the list of (upper-cased) words that were found; an empty list
# means that the text is clean.
sub check
{
	my $s = uc shift;
	my @out;

	for my $matcher (@regex) {
		push @out, $matcher->($s);
	}

	return @out if @out;

	for my $word (split(/\s+/, $s)) {
		# Try the token as it stands first: unconditionally stripping the S
		# meant that a listed word which itself ends in S was never found.
		if ($badword->in($word)) {
			push @out, $word;
			next;
		}

		# then retry as a plural / possessive of a listed word
		(my $stem = $word) =~ s/\'?S$//;
		push @out, $stem if $stem ne $word && $badword->in($stem);
	}

	return @out;
}

1;
fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00			`#`
			`# Search for bad words in strings`
			`#`
			`# Copyright (c) 2000 Dirk Koopman`
			`#`
			`# $Id$`
			`#`

			`package BadWords;`

			`use strict;`

			`use DXUtil;`
			`use DXVars;`
5. Change the badwords interface to be the same as baddx, badspotter et al. added set/badword, unset/badword and show/badword. This routine will auto convert (and delete afterwards) the old badwords file. Also make the ann->talk thingy less aggressive 2001-09-13 19:58:05 +00:00			`use DXHash;`
7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00			`use DXDebug;`

fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00			`use IO::File;`

7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00			`use vars qw($badword @regex);`
fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00
5. Change the badwords interface to be the same as baddx, badspotter et al. added set/badword, unset/badword and show/badword. This routine will auto convert (and delete afterwards) the old badwords file. Also make the ann->talk thingy less aggressive 2001-09-13 19:58:05 +00:00			`my $oldfn = "$main::data/badwords";`
7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00			`my $regex = "$main::data/badw_regex";`
			`my $bwfn = "$main::data/badword";`

			`# copy issue ones across`
			`filecopy("$regex.issue", $regex) unless -e $regex;`
			`filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;`

5. Change the badwords interface to be the same as baddx, badspotter et al. added set/badword, unset/badword and show/badword. This routine will auto convert (and delete afterwards) the old badwords file. Also make the ann->talk thingy less aggressive 2001-09-13 19:58:05 +00:00			`$badword = new DXHash "badword";`
fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00
change build number calculation to be more accurate 2001-09-01 12:15:09 +00:00			`use vars qw($VERSION $BRANCH);`
			`$VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ );`
			`$BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/ ) \|\| 0;`
			`$main::build += $VERSION;`
			`$main::branch += $BRANCH;`

fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00			`# load the badwords file`
			`sub load`
			`{`
			`my @out;`
5. Change the badwords interface to be the same as baddx, badspotter et al. added set/badword, unset/badword and show/badword. This routine will auto convert (and delete afterwards) the old badwords file. Also make the ann->talk thingy less aggressive 2001-09-13 19:58:05 +00:00			`my $fh = new IO::File $oldfn;`
fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00
			`if ($fh) {`
			`while (<$fh>) {`
			`chomp;`
			`next if /^\s*\#/;`
			`my @list = split " ";`
			`for (@list) {`
5. Change the badwords interface to be the same as baddx, badspotter et al. added set/badword, unset/badword and show/badword. This routine will auto convert (and delete afterwards) the old badwords file. Also make the ann->talk thingy less aggressive 2001-09-13 19:58:05 +00:00			`$badword->add($_);`
fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00			`}`
			`}`
			`$fh->close;`
5. Change the badwords interface to be the same as baddx, badspotter et al. added set/badword, unset/badword and show/badword. This routine will auto convert (and delete afterwards) the old badwords file. Also make the ann->talk thingy less aggressive 2001-09-13 19:58:05 +00:00			`$badword->put;`
			`unlink $oldfn;`
7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00			`}`
			`push @out, create_regex();`
			`return @out;`
			`}`

			`sub create_regex`
			`{`
			`my @out;`
			`@regex = ();`

			`my $fh = new IO::File $regex;`

			`if ($fh) {`
			`while (<$fh>) {`
			`chomp;`
			`next if /^\s*\#/;`
			`my @list = split " ";`
			`for (@list) {`
			`# create a closure for each word so that it matches stuff with spaces/punctuation`
			`# and repeated characters in it`
			`my $w = uc $_;`
remove automatic [I1] and [O0] creation in regexes 2001-10-04 14:46:41 +00:00			`my @l = split //, $w;`
7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00			`my $e = join '+[\s\W]+', @l;`
			`my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } };`
			`push @regex, $s unless $@;`
			`dbg("create_regex: $@") if $@;`
			`}`
			`}`
			`$fh->close;`
fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00			`} else {`
7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00			`my $l = "can't open $regex $!";`
5. Change the badwords interface to be the same as baddx, badspotter et al. added set/badword, unset/badword and show/badword. This routine will auto convert (and delete afterwards) the old badwords file. Also make the ann->talk thingy less aggressive 2001-09-13 19:58:05 +00:00			`dbg($l);`
fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00			`push @out, $l;`
			`}`
7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00
fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00			`return @out;`
			`}`

			`# check the text against the badwords list`
			`sub check`
			`{`
Try each badword with an S on the end as well and also check for 'f.u c' type things as well (this only works for a few wellknown english ones). 2001-10-01 14:30:18 +00:00			`my $s = uc shift;`
7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00			`my @out;`

			`for (@regex) {`
			`push @out, &$_($s);`
			`}`

			`return @out if @out;`
Try each badword with an S on the end as well and also check for 'f.u c' type things as well (this only works for a few wellknown english ones). 2001-10-01 14:30:18 +00:00
			`for (split(/\s+/, $s)) {`
			`s/\'?S$//;`
7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00			`push @out, $_ if $badword->in($_);`
Try each badword with an S on the end as well and also check for 'f.u c' type things as well (this only works for a few wellknown english ones). 2001-10-01 14:30:18 +00:00			`}`
7. improved the regex matching of badwords (more efficient, better coverage) 8. added default badword and badw_regex tables (as .issue files) which will activate unless there is one there already. 2001-10-04 13:53:47 +00:00
			`return @out;`
fix a talk bug for t xxx > yyy added badword checking 2000-09-02 15:28:14 +00:00			`}`

			`1;`