ASi

searchwords.pl : parallel grep by multi processes

#!/bin/perl

# Description:
# Search words from text
#
# Usage:
# Run this command without any arguments to print the usage text.
#
# For cygwin you need to install Win32::Semaphore module.
# Run "cpan Win32::Semaphore".
# Beforehand you may need to install 
# - GNU make
# - GNU gcc
# - GNU g++
# - libgpgme
# - patchutils
# - wget
# - libcrypt-devel (on x86_64)
# - cpan YAML
# - cpan ExtUtils::CBuilder
# ahead of time by running the Cygwin installer.

use strict;
use Time::HiRes qw(usleep);
use IPC::SysV qw(IPC_CREAT IPC_PRIVATE SEM_UNDO S_IWUSR);
use IPC::Semaphore;
use POSIX ":sys_wait_h";
use Config;

# Platform switch: 1 when perl was built for Cygwin, 0 otherwise.
# Win32::Semaphore is loaded at run time only in the Cygwin case.
my $CYGWIN = ($Config{"osname"} eq "cygwin") ? 1 : 0;
require Win32::Semaphore if $CYGWIN;

# initialization

# Set to 1 by the --count option: print only the match count per word.
my $CNTONLY = 0;

# Positional arguments: the text to search and the list of words.
my ($textFileName, $wordsFileName);

# Pid of the root process, plus the bookkeeping of forked children
# (pid => placeholder value) and how many of them are outstanding.
my $rootPid = $$;
my %childrenPid;
my $childrenCnt = 0;

# Binary semaphore used as a mutex: a named Win32 semaphore on Cygwin,
# a private SysV semaphore set with one member everywhere else.
my $sem = $CYGWIN
	? Win32::Semaphore->new(1, 1, "searchwords.pl")
	: IPC::Semaphore->new(IPC_PRIVATE, 1, IPC_CREAT | S_IWUSR);
unless ($sem) {
	print "failed to create ipc sem\n";
	exit 1;
}

# The SysV semaphore value is set to 1 (unlocked) explicitly;
# the Win32 one already received its initial count in new().
$sem->setval(0, 1) unless $CYGWIN;

# Acquire the mutex, blocking until it is available.
# Cygwin waits on the Win32 semaphore; other platforms decrement SysV
# semaphore 0 with SEM_UNDO.
sub ipcEnter{
	$CYGWIN ? $sem->wait() : $sem->op(0, -1, SEM_UNDO);
	#print "entered\n";
}
# Release the mutex taken by ipcEnter().
#
# On the SysV path the release carries SEM_UNDO, just like the acquire.
# Each SEM_UNDO acquire adds +1 to this process's undo adjustment; a release
# without SEM_UNDO never cancels it, so the kernel would add the leftover
# adjustment to the semaphore value when the process exits. The value would
# then exceed 1 and the semaphore would no longer provide mutual exclusion.
sub ipcLeave{
	if ($CYGWIN){
		$sem->release();
	}else{
		$sem->op(0, 1, SEM_UNDO);
	}
	#print "left\n";
}

# termination

# Release IPC resources before the program ends.
# Only the SysV semaphore is removed explicitly; nothing is done on Cygwin.
sub cleanup{
	$sem->remove() unless $CYGWIN;
	logd("cleanuped\n");
}

# SIGINT handler.
# Children inherit this handler through fork(); they just exit.
# The root process terminates every recorded child, removes the semaphore
# and exits with status 1.
$SIG{'INT'} = sub {
	if ($rootPid != $$){
		exit 1;
	}
	print "Caught SIG INT...\n";

	# %childrenPid lives only in this process, so it is read without taking
	# the semaphore. The semaphore is not reentrant: if the signal arrived
	# while the root process itself held it, acquiring it here would block
	# forever.
	foreach my $pid (keys(%childrenPid)){
		print "kill $pid\n";
		# kill() takes the signal first and the process list after it;
		# with a single argument the pid would be used as the signal and
		# nothing would be signalled.
		kill('TERM', $pid);
	}

	cleanup();

	print "terminated\n";
	exit 1;
};

# parse args

# Without any argument, print the usage text and stop with status 1.
if (@ARGV == 0){
	# The example passes --count to the script; placed before the script
	# name it would be handed to the perl interpreter instead.
	print <<'EOT';
Usage:
    searchwords.pl [options] [target_text] [words_list_file] [options]

    e.g. $ perl searchwords.pl --count target.txt words.txt

words_list_file:
    Each word should be separated by \n.

options:
--count
    Output count of words only.
EOT
	exit 1;
}

# Walk the command line: arguments starting with "-" are options, the first
# other argument is the text file, and any later one is the words file
# (the last one wins).
for my $arg (@ARGV){
	if ($arg =~ /^-/){
		# String comparison: with "==" both sides are converted to numbers,
		# so every non-numeric option would compare equal to "--count".
		if ($arg eq "--count"){
			$CNTONLY = 1;
		}else{
			print STDERR "unknown option ignored: $arg\n";
		}
	}elsif (!defined($textFileName) || length($textFileName) == 0){
		$textFileName = $arg;
	}else{
		$wordsFileName = $arg;
	}
}
# Both file names are mandatory.
if (!defined($textFileName) || length($textFileName) == 0
	|| !defined($wordsFileName) || length($wordsFileName) == 0){
	print "files should be specified!\n";
	exit(1);
}

# main

print "text : $textFileName\n";

# Lexical handle instead of a bareword; the message says which file failed
# and why.
my $wordsFh;
if(!open($wordsFh, "<", $wordsFileName)){
	print "file open error! ($wordsFileName: $!)\n";
	exit(1);
}

# Shell-escaped "-e word " arguments collected for the next child.
# forkChild() reads this variable directly when it builds the grep command.
my $paramWords = "";

my $CHILDREN_LIMIT = 8;		# maximum number of children running at once
my $WORDS_PER_CHILD = 1;	# number of words handed to each child
my $i = 0;					# words collected since the last fork
my $wordInfo;				# passed to forkChild(); never assigned here
while(my $word = <$wordsFh>){
	# prepare param for command
	chomp($word);
	if (length($word) == 0){
		next;
	}
	# Backslash-escape every byte outside a small safe set so the shell
	# passes the word to grep unchanged. This covers space, tab, #, ( and )
	# as well as ; | & $ ` quotes, globs and the backslash itself.
	# Bytes >= 0x80 (multi-byte characters) are left untouched.
	$word =~ s/([^A-Za-z0-9_.\/\x80-\xff-])/\\$1/g;
	$paramWords .= "-e $word ";

	$i++;
	if ($i >= $WORDS_PER_CHILD){
		# run command by children
		forkChild($wordInfo);
		$i = 0;
		$paramWords = "";
	}

	# monitor children: once the limit is reached, block until one ends
	if ($childrenCnt >= $CHILDREN_LIMIT){
		my $child = waitpid(-1, 0);
		if ($child == -1){
			# waitpid found no child at all: reset the bookkeeping
			logd("there are not any children\n");
			ipcEnter();
			%childrenPid = ();
			$childrenCnt = 0;
			ipcLeave();
		}elsif (exists($childrenPid{$child})){
			logd("child $child terminated\n");
			ipcEnter();
			delete($childrenPid{$child});
			$childrenCnt--;
			ipcLeave();
		}
	}
}
close($wordsFh);
if ($i != 0){
	# run remaining command by children
	forkChild($wordInfo);
}

# Fork one child that greps the text file for the words currently held in
# $paramWords (already shell-escaped "-e word " arguments built by the caller).
#
# Parameter:
#   $_[0]  optional note appended to the result line as " : note"
#
# Parent: records the child's pid in %childrenPid, increments $childrenCnt
# and returns. If fork() fails, an error is written to STDERR and the sub
# returns without recording anything.
# Child: prints the matching lines (or only the count when $CNTONLY is set),
# then the summary line, and exits with status 0. It never returns.
sub forkChild{
	my $info = $_[0];
	ipcEnter();
	my $pid = fork();
	if (!defined($pid)){
		# fork() failed. Without this check the undef result would fall
		# into the child branch and the root process would exit.
		my $reason = $!;
		ipcLeave();
		print STDERR "failed to fork: $reason\n";
		return;
	}
	if ($pid){
		# parent
		$childrenCnt++;
		$childrenPid{$pid} = "dummy";
		ipcLeave();
		logd("forked $pid\n");
		return;
	}

	# child

	# Unbuffered output: each print is written while the semaphore is held.
	# With block buffering the real write would happen later, outside the
	# lock, and output of different children could be mixed.
	$| = 1;

	# Single-quote the file name for the shell so that spaces and other
	# special characters in the path do not split or alter the command.
	(my $quotedText = $textFileName) =~ s/'/'\\''/g;
	$quotedText = "'$quotedText'";

	logd("cat $quotedText | grep -c $paramWords\n");
	my $cnt = 0;
	if ($CNTONLY){
		$cnt = `cat $quotedText | grep -c $paramWords`;
		chomp($cnt) if defined($cnt);
	}else{
		# `` will alloc memory for whole output, but open will do streaming
		# and it doesn't use many memory.
		open(my $grepOut, '-|', "cat $quotedText | grep $paramWords")
			or die "failed to run grep\n";
		while(my $line = <$grepOut>){
			++$cnt;
			ipcEnter();
			print $line;
			ipcLeave();
		}
		close($grepOut);
	}

	# output
	$paramWords =~ s/-e //g;
	ipcEnter();
	print("words $paramWords count is $cnt");
	if (defined($info) && length($info) != 0){
		print(" : $info\n");
	}else{
		print("\n");
	}
	ipcLeave();

	exit(0);
}

# Block until every child still recorded in %childrenPid has ended,
# then release the IPC resources.
foreach my $pid (keys(%childrenPid)){
	logd("wait to terminate child $pid\n");
	waitpid($pid, 0);
}

cleanup();


# utilities

# Debug logger. Output is disabled: the message in $_[0] is discarded.
# Uncomment the print below to see the debug trace.
sub logd{
	#print $_[0];
	return;
}