spider/perl/BadWords.pm

111 lines
1.7 KiB
Perl
Raw Normal View History

#
# Search for bad words in strings
#
# Copyright (c) 2000 Dirk Koopman
#
2007-06-24 01:17:43 +00:00
#
#
package BadWords;
use strict;
use DXUtil;
use DXVars;
use DXHash;
use DXDebug;
use IO::File;
2001-10-04 15:52:12 +00:00
use vars qw($badword $regexcode);
my $oldfn = "$main::data/badwords";
my $regex = "$main::data/badw_regex";
my $bwfn = "$main::data/badword";
# copy issue ones across
filecopy("$regex.gb.issue", $regex) unless -e $regex;
filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;
$badword = new DXHash "badword";
# load the badwords file
sub load
{
my @out;
my $fh = new IO::File $oldfn;
if ($fh) {
while (<$fh>) {
chomp;
next if /^\s*\#/;
my @list = split " ";
for (@list) {
$badword->add($_);
}
}
$fh->close;
$badword->put;
unlink $oldfn;
}
push @out, create_regex();
return @out;
}
sub create_regex
{
my @out;
my $fh = new IO::File $regex;
if ($fh) {
2001-10-04 15:52:12 +00:00
my $s = "sub { my \$str = shift; my \@out; \n";
while (<$fh>) {
chomp;
next if /^\s*\#/;
my @list = split " ";
for (@list) {
# create a closure for each word so that it matches stuff with spaces/punctuation
# and repeated characters in it
my $w = uc $_;
my @l = split //, $w;
2001-10-04 15:52:12 +00:00
my $e = join '+[\s\W]*', @l;
$s .= "push \@out, \$1 if \$str =~ /\\b($e)/;\n";
}
}
2001-10-04 15:52:12 +00:00
$s .= "return \@out;\n}";
$regexcode = eval $s;
dbg($s) if isdbg('badword');
if ($@) {
@out = ($@);
dbg($@);
return @out;
}
$fh->close;
} else {
my $l = "can't open $regex $!";
dbg($l);
push @out, $l;
}
return @out;
}
# check the text against the badwords list
sub check
{
my $s = uc shift;
my @out;
2001-10-04 15:52:12 +00:00
push @out, &$regexcode($s) if $regexcode;
return @out if @out;
for (split(/\b/, $s)) {
push @out, $_ if $badword->in($_);
}
return @out;
}
1;