diff --git a/icu4c/source/samples/ugrep/Makefile b/icu4c/source/samples/ugrep/Makefile new file mode 100644 index 00000000000..83b255fb8c0 --- /dev/null +++ b/icu4c/source/samples/ugrep/Makefile @@ -0,0 +1,4 @@ +# Copyright (c) 2002 IBM, Inc. and others +# sample code makefile + +# TODO diff --git a/icu4c/source/samples/ugrep/readme.txt b/icu4c/source/samples/ugrep/readme.txt new file mode 100644 index 00000000000..c9dd7d507a3 --- /dev/null +++ b/icu4c/source/samples/ugrep/readme.txt @@ -0,0 +1,69 @@ +ugrep: a sample program demonstrating the use of the ICU regular expression API. + +usage: ugrep [options] pattern file ... + + --help Output a brief help message + -n, --line-number Prefix each line of output with the line number within its input file. + -V, --version Output the program version number + + +The program searches for the specified regular expression in each of the +specified files, and outputs each matching line. + +Input files are in the system default (locale dependent) encoding, unless they +begin with a BOM, in which case they are assumed to be in the UTF encoding +specified by the BOM. Program output is always in the system's default +8-bit code page. + + +Files: + ./ugrep.cpp source code for the sample + ./ugrep.dsw Windows MSVC workspace. Double-click this to get started. + ./ugrep.dsp Windows MSVC project file. + ./Makefile Makefile for Unixes. Needs gmake. + + +To Build ugrep on Windows + 1. Install and build ICU + 2. In MSVC, open the workspace file icu\source\samples\ugrep\ugrep.dsw + 3. Choose a Debug or Release build. + 4. Build. + +To Run on Windows + 1. Start a command shell window + 2. Add ICU's bin directory to the path, e.g. + set PATH=c:\icu\bin;%PATH% + (Use the path to wherever ICU is on your system.) + 3. cd into the ugrep directory, e.g. + cd c:\icu\source\samples\ugrep\debug + 4. Run it + ugrep ... + + +To Build on Unixes + 1. Build ICU. Specify an ICU install directory when running configure, + using the --prefix option.
The steps to build ICU will look something + like this: + cd <icu directory>/source + runConfigureICU <platform> --prefix <icu install directory> [other options] + gmake all + + 2. Install ICU, + gmake install + + 3. Build the sample + cd <icu directory>/source/samples/ugrep + export ICU_PREFIX=<icu install directory> + gmake + + To Run on Unixes + cd <icu directory>/source/samples/ugrep + + export LD_LIBRARY_PATH=<icu install directory>/lib:.:$LD_LIBRARY_PATH + ugrep ... + + + Note: The name of the LD_LIBRARY_PATH variable is different on some systems. + If in doubt, run the sample using "gmake check", and note the name of + the variable that is used there. LD_LIBRARY_PATH is the correct name + for Linux and Solaris. diff --git a/icu4c/source/samples/ugrep/ugrep.cpp b/icu4c/source/samples/ugrep/ugrep.cpp new file mode 100644 index 00000000000..87d560ab306 --- /dev/null +++ b/icu4c/source/samples/ugrep/ugrep.cpp @@ -0,0 +1,449 @@ +/************************************************************************** +* +* Copyright (C) 2002, International Business Machines +* Corporation and others. All Rights Reserved. +* +*************************************************************************** +*/ + +// +// ugrep - an ICU sample program illustrating the use of ICU Regular Expressions. +// +// The use of the ICU Regex API all occurs within the main() +// function. The rest of the code deals with opening files, +// encoding conversions, printing results, etc. +// +// This is not a full-featured grep program. The command line options +// have been kept to a minimum to avoid complicating the sample code. +// + + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "unicode/regex.h" +#include "unicode/ucnv.h" +#include "unicode/uclean.h" + + +// +// The following variables contain parameters that may be set from the command line.
+// +const char *pattern = NULL; // The regular expression +int firstFileNum; // argv index of the first file name +UBool displayFileName = FALSE; +UBool displayLineNum = FALSE; + + +// +// Info regarding the file currently being processed +// +const char *fileName; +int fileLen; // Length, in UTF-16 Code Units. + +UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads + // the whole file at once.) + +char *charBuf = 0; // Buffer, for original, unconverted file data. + + +// +// Info regarding the line currently being processed +// +int lineStart; // Index of first char of the current line in the file buffer +int lineEnd; // Index of char following the new line sequence for the current line +int lineNum; + +// +// Converter, used on output to convert Unicode data back to char * +// so that it will display in non-Unicode terminal windows. +// +UConverter *outConverter = 0; + +// +// Function forward declarations +// +void processOptions(int argc, const char **argv); +void nextLine(int start); +void printMatch(); +void printUsage(); +void readFile(const char *name); + + + +//------------------------------------------------------------------------------------------ +// +// main for ugrep +// +// Structurally, all use of the ICU Regular Expression API is in main(), +// and all of the supporting stuff necessary to make a running program, but +// not directly related to regular expressions, is factored out into these other +// functions. +// +// Returns 0 if at least one line matched, 1 otherwise. +// +//------------------------------------------------------------------------------------------ +int main(int argc, const char** argv) { + UBool matchFound = FALSE; + + // + // Process the command line options. + // + processOptions(argc, argv); + + // + // Create a RegexPattern object from the user supplied pattern string. + // + UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure + // in a status variable.
+ + UParseError parseErr; // In the event of a syntax error in the regex pattern, + // this struct will contain the position of the + // error. + + RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status); + // Note that C++ is doing an automatic conversion + // of the (char *) pattern to a temporary + // UnicodeString object. + if (U_FAILURE(status)) { + fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n", + u_errorName(status), parseErr.offset); + exit(-1); + } + + // + // Create a RegexMatcher from the newly created pattern. + // + UnicodeString empty; + RegexMatcher *matcher = rePat->matcher(empty, status); + if (U_FAILURE(status)) { + fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n", + u_errorName(status)); + exit(-1); + } + + // + // Loop, processing each of the input files. + // + for (int fileNum=firstFileNum; fileNum < argc; fileNum++) { + readFile(argv[fileNum]); + + // + // Loop through the lines of a file, trying to match the regex pattern on each. + // + for (nextLine(0); lineStart < fileLen; nextLine(lineEnd)) { + // Wrap the current line in a UnicodeString built directly over the file + // buffer (not NUL-terminated, hence FALSE), then point the matcher at it. + UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart); + matcher->reset(s); + if (matcher->find()) { + matchFound = TRUE; + printMatch(); + } + } + } + + // + // Clean up + // + delete matcher; + delete rePat; + free(ucharBuf); + free(charBuf); + ucnv_close(outConverter); + + u_cleanup(); // shut down ICU, release any cached data it owns. + + return matchFound? 0: 1; +} + + + +//------------------------------------------------------------------------------------------ +// +// processOptions Run through the command line options, and set +// the global variables accordingly. +// +// exit without returning if an error occurred and +// ugrep should not proceed further.
+// +//------------------------------------------------------------------------------------------ +void processOptions(int argc, const char **argv) { + int optInd; + UBool doUsage = FALSE; + UBool doVersion = FALSE; + const char *arg; + + + for(optInd = 1; optInd < argc; ++optInd) { + arg = argv[optInd]; + + /* version info */ + if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) { + doVersion = TRUE; + } + /* usage info */ + else if(strcmp(arg, "--help") == 0) { + doUsage = TRUE; + } + if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) { + displayLineNum = TRUE; + } + /* POSIX.1 says all arguments after -- are not options */ + else if(strcmp(arg, "--") == 0) { + /* skip the -- */ + ++optInd; + break; + } + /* unrecognized option */ + else if(strncmp(arg, "-", strlen("-")) == 0) { + printf("ugrep: invalid option -- %s\n", arg+1); + doUsage = TRUE; + } + /* done with options */ + else { + break; + } + } + + if (doUsage) { + printUsage(); + exit(0); + } + + if (doVersion) { + printf("ugrep version 0.01"); + if (optInd == argc) { + exit(0); + } + } + + int remainingArgs = argc-optInd; // pattern file ... + if (remainingArgs < 2) { + fprintf(stderr, "ugrep: files or pattern are missing.\n"); + printUsage(); + exit(1); + } + + if (remainingArgs > 2) { + // More than one file to be processed. Display file names with match output. 
+ displayFileName = TRUE; + } + + pattern = argv[optInd]; + firstFileNum = optInd+1; +} + +//------------------------------------------------------------------------------------------ +// +// printUsage +// +//------------------------------------------------------------------------------------------ +void printUsage() { + printf("ugrep [options] pattern file...\n" + " -V or --version display version information\n" + " --help display this help and exit\n" + " -- stop further option processing\n" + "-n, --line-number Prefix each line of output with the line number within its input file.\n" + ); + exit(0); +} + +//------------------------------------------------------------------------------------------ +// +// readFile Read a file into memory, and convert it to Unicode. +// +// Since this is just a demo program, take the simple minded approach +// of always reading the whole file at once. No intelligent buffering +// is done. +// +//------------------------------------------------------------------------------------------ +void readFile(const char *name) { + + // + // Initialize global file variables + // + fileName = name; + fileLen = 0; // zero length prevents processing in case of errors. + + + // + // Open the file and determine its size. + // + FILE *file = fopen(name, "rb"); + if (file == 0 ) { + fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName); + return; + } + fseek(file, 0, SEEK_END); + int rawFileLen = ftell(file); + fseek(file, 0, SEEK_SET); + + + // + // Read in the file + // + charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking... 
+ int t = fread(charBuf, 1, rawFileLen, file); + if (t != rawFileLen) { + fprintf(stderr, "Error reading file \"%s\"\n", fileName); + return; + } + charBuf[rawFileLen]=0; + fclose(file); + + // + // Look for a Unicode Signature (BOM) in the data + // + int32_t signatureLength; + const char * charDataStart = charBuf; + UErrorCode status = U_ZERO_ERROR; + const char* encoding = ucnv_detectUnicodeSignature( + charDataStart, rawFileLen, &signatureLength, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n", + u_errorName(status)); + return; + } + if(encoding!=NULL ){ + charDataStart += signatureLength; + rawFileLen -= signatureLength; + } + + // + // Open a converter to take the file to UTF-16 + // + UConverter* conv; + conv = ucnv_open(encoding, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status)); + return; + } + + // + // Convert the file data to UChar. + // Preflight first to determine required buffer size. + // + uint32_t destCap = ucnv_toUChars(conv, + NULL, // dest, + 0, // destCapacity, + charDataStart, + rawFileLen, + &status); + if (status != U_BUFFER_OVERFLOW_ERROR) { + fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); + return; + }; + + status = U_ZERO_ERROR; + ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar)); + ucnv_toUChars(conv, + ucharBuf, // dest, + destCap+1, + charDataStart, + rawFileLen, + &status); + if (U_FAILURE(status)) { + fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); + return; + }; + ucnv_close(conv); + + // + // Successful conversion. Set the global size variables so that + // the rest of the processing will proceed for this file. 
+ // + fileLen = destCap; +} + + + + + +//------------------------------------------------------------------------------------------ +// +// nextLine Advance the line index variables, starting at the +// specified position in the input file buffer, by +// scanning forwrd until the next end-of-line. +// +// Need to take into account all of the possible Unicode +// line ending sequences. +// +//------------------------------------------------------------------------------------------ +void nextLine(int startPos) { + if (startPos == 0) { + lineNum = 0; + } else { + lineNum++; + } + lineStart = lineEnd = startPos; + + for (;;) { + if (lineEnd >= fileLen) { + return; + } + UChar c = ucharBuf[lineEnd]; + lineEnd++; + if (c == 0x0a || // Line Feed + c == 0x0c || // Form Feed + c == 0x0d || // Carriage Return + c == 0x85 || // Next Line + c == 0x2028 || // Line Separator + c == 0x2029) // Paragraph separator + { + break; + } + } + + // Check for CR/LF sequence, and advance over the LF if we're in the middle of one. + if (lineEnd < fileLen && + ucharBuf[lineEnd-1] == 0x0d && + ucharBuf[lineEnd] == 0x0a) + { + lineEnd++; + } +} + + +//------------------------------------------------------------------------------------------ +// +// printMatch Called when a matching line has been located. +// Print out the line from the file with the match, after +// converting it back to the default code page. +// +//------------------------------------------------------------------------------------------ +void printMatch() { + char buf[2000]; + UErrorCode status = U_ZERO_ERROR; + + // If we haven't already created a converter for output, do it now. + if (outConverter == 0) { + outConverter = ucnv_open(NULL, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n", + u_errorName(status)); + exit(-1); + } + }; + + // Convert the line to be printed back to the default 8 bit code page. 
+ // If the line is too long for our buffer, just truncate it. + ucnv_fromUChars(outConverter, + buf, // destination buffer for conversion + sizeof(buf), // capacity of destination buffer + &ucharBuf[lineStart], // Input to conversion + lineEnd-lineStart, // number of UChars to convert + &status); + buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines. + // The converter null-terminates its output unless + // the buffer completely fills. + + if (displayFileName) { + printf("%s:", fileName); + } + if (displayLineNum) { + printf("%d:", lineNum); + } + printf("%s", buf); +} + diff --git a/icu4c/source/samples/ugrep/ugrep.dsp b/icu4c/source/samples/ugrep/ugrep.dsp new file mode 100644 index 00000000000..e63fc05cb1c --- /dev/null +++ b/icu4c/source/samples/ugrep/ugrep.dsp @@ -0,0 +1,102 @@ +# Microsoft Developer Studio Project File - Name="ugrep" - Package Owner=<4> +# Microsoft Developer Studio Generated Build File, Format Version 6.00 +# ** DO NOT EDIT ** + +# TARGTYPE "Win32 (x86) Console Application" 0x0103 + +CFG=ugrep - Win32 Debug +!MESSAGE This is not a valid makefile. To build this project using NMAKE, +!MESSAGE use the Export Makefile command and run +!MESSAGE +!MESSAGE NMAKE /f "ugrep.mak". +!MESSAGE +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. 
For example: +!MESSAGE +!MESSAGE NMAKE /f "ugrep.mak" CFG="ugrep - Win32 Debug" +!MESSAGE +!MESSAGE Possible choices for configuration are: +!MESSAGE +!MESSAGE "ugrep - Win32 Release" (based on "Win32 (x86) Console Application") +!MESSAGE "ugrep - Win32 Debug" (based on "Win32 (x86) Console Application") +!MESSAGE + +# Begin Project +# PROP AllowPerConfigDependencies 0 +# PROP Scc_ProjName "" +# PROP Scc_LocalPath "" +CPP=cl.exe +RSC=rc.exe + +!IF "$(CFG)" == "ugrep - Win32 Release" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 0 +# PROP BASE Output_Dir "Release" +# PROP BASE Intermediate_Dir "Release" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 0 +# PROP Output_Dir "Release" +# PROP Intermediate_Dir "Release" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c +# ADD CPP /nologo /MD /W3 /GX /O2 /I "..\..\..\include" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c +# ADD BASE RSC /l 0x409 /d "NDEBUG" +# ADD RSC /l 0x409 /d "NDEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 +# ADD LINK32 kernel32.lib user32.lib /nologo /subsystem:console /machine:I386 + +!ELSEIF "$(CFG)" == "ugrep - Win32 Debug" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 1 +# PROP BASE Output_Dir "Debug" +# PROP BASE Intermediate_Dir "Debug" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 1 +# PROP Output_Dir "Debug" +# PROP Intermediate_Dir "Debug" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od 
/D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c +# ADD CPP /nologo /G6 /MDd /W3 /Gm /GX /ZI /Od /I "..\..\..\include" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c +# ADD BASE RSC /l 0x409 /d "_DEBUG" +# ADD RSC /l 0x409 /d "_DEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept +# ADD LINK32 kernel32.lib user32.lib icuucd.lib icuind.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"../../../lib" + +!ENDIF + +# Begin Target + +# Name "ugrep - Win32 Release" +# Name "ugrep - Win32 Debug" +# Begin Group "Source Files" + +# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat" +# Begin Source File + +SOURCE=.\ugrep.cpp +# End Source File +# End Group +# Begin Group "Header Files" + +# PROP Default_Filter "h;hpp;hxx;hm;inl" +# End Group +# Begin Group "Resource Files" + +# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe" +# End Group +# End Target +# End Project diff --git a/icu4c/source/samples/ugrep/ugrep.dsw b/icu4c/source/samples/ugrep/ugrep.dsw new file mode 100644 index 00000000000..de33a2540e7 --- /dev/null +++ b/icu4c/source/samples/ugrep/ugrep.dsw @@ -0,0 +1,29 @@ +Microsoft Developer Studio Workspace File, Format Version 6.00 +# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE! 
+ +############################################################################### + +Project: "ugrep"=.\ugrep.dsp - Package Owner=<4> + +Package=<5> +{{{ +}}} + +Package=<4> +{{{ +}}} + +############################################################################### + +Global: + +Package=<5> +{{{ +}}} + +Package=<3> +{{{ +}}} + +############################################################################### +