ICU-22689 Add PPUCD-based data driven test for binary props

See #2889
This commit is contained in:
Elango Cheran 2024-03-19 18:37:43 +00:00
parent 7a3dfe877d
commit 1be861209e
3 changed files with 94 additions and 18 deletions

View file

@ -12,9 +12,11 @@
#include "unicode/putil.h"
#include "unicode/uscript.h"
#include "unicode/uset.h"
#include "charstr.h"
#include "cstring.h"
#include "hash.h"
#include "patternprops.h"
#include "ppucd.h"
#include "normalizer2impl.h"
#include "testutil.h"
#include "uparse.h"
@ -80,6 +82,7 @@ void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
TESTCASE_AUTO(TestPropertyNames);
TESTCASE_AUTO(TestIDSUnaryOperator);
TESTCASE_AUTO(TestIDCompatMath);
TESTCASE_AUTO(TestBinaryPropertyUsingPpucd);
TESTCASE_AUTO_END;
}
@ -1024,3 +1027,93 @@ void UnicodeTest::TestIDCompatMath() {
assertTrue("idcmStart.contains(U+1D7C3)", idcmStart.contains(0x1D7C3));
assertFalse("idcmStart.contains(U+1D7C4)", idcmStart.contains(0x1D7C4));
}
U_NAMESPACE_BEGIN
/**
 * PropertyNames implementation that resolves property names and property
 * value names by forwarding to u_getPropertyEnum() and
 * u_getPropertyValueEnum().
 */
class BuiltInPropertyNames : public PropertyNames {
public:
    ~BuiltInPropertyNames() override = default;

    /** Returns whatever u_getPropertyEnum() yields for the given name. */
    int32_t getPropertyEnum(const char *name) const override {
        return u_getPropertyEnum(name);
    }

    /**
     * Returns whatever u_getPropertyValueEnum() yields for the given value
     * name, after converting the integer property to a UProperty.
     */
    int32_t getPropertyValueEnum(int32_t property, const char *name) const override {
        const UProperty propertyEnum = static_cast<UProperty>(property);
        return u_getPropertyValueEnum(propertyEnum, name);
    }
};
U_NAMESPACE_END
/**
 * Data-driven test: for each binary property listed in propsUnderTest, build a
 * UnicodeSet from the lines of ppucd.txt and assert that it equals the set
 * produced by UnicodeSet::applyIntPropertyValue(prop, 1).
 *
 * Exits early (after reporting via errln) when the unidata path cannot be
 * determined, when ppucd.txt cannot be opened, or when reading/parsing a line
 * fails.
 */
void UnicodeTest::TestBinaryPropertyUsingPpucd() {
    IcuTestErrorCode errorCode(*this, "TestBinaryPropertyUsingPpucd()");

    // Initialize PPUCD parsing object using file in repo and using
    // property names present in built-in data in ICU
    char buffer[500];
    // get path to `source/data/unidata/` including trailing `/`
    char *unidataPath = getUnidataPath(buffer);
    if(unidataPath == nullptr) {
        errln("exiting early because unable to open ppucd.txt from ICU source tree");
        return;
    }
    CharString ppucdPath(unidataPath, errorCode);
    ppucdPath.appendPathPart("ppucd.txt", errorCode);
    PreparsedUCD ppucd(ppucdPath.data(), errorCode);
    if(errorCode.isFailure()) {
        errln("unable to open %s - %s\n",
              ppucdPath.data(), errorCode.errorName());
        return;
    }
    BuiltInPropertyNames builtInPropNames;
    ppucd.setPropertyNames(&builtInPropNames);

    // Define which binary properties we want to compare
    constexpr UProperty propsUnderTest[] = {
        UCHAR_IDS_UNARY_OPERATOR,
        UCHAR_ID_COMPAT_MATH_START,
        UCHAR_ID_COMPAT_MATH_CONTINUE,
    };

    // Allocate & initialize UnicodeSets per binary property from PPUCD data
    UnicodeSet ppucdPropSets[std::size(propsUnderTest)];

    // Iterate through PPUCD file, accumulating each line's data into each UnicodeSet per property
    PreparsedUCD::LineType lineType;
    UnicodeSet newValues;
    bool parseFailed = false;
    while((lineType=ppucd.readLine(errorCode))!=PreparsedUCD::NO_LINE && errorCode.isSuccess()) {
        if(!ppucd.lineHasPropertyValues()) {
            continue;
        }
        const UniProps *lineProps=ppucd.getProps(newValues, errorCode);
        // Never dereference the result of a failed getProps() call: newValues
        // may already list a property under test while the returned pointer is
        // unusable (presumably nullptr on failure -- see ppucd.h).
        if(lineProps == nullptr || errorCode.isFailure()) {
            parseFailed = true;
            break;
        }
        for(size_t i = 0; i < std::size(propsUnderTest); i++) {
            UProperty prop = propsUnderTest[i];
            // Only properties reported in newValues are updated for this line.
            if (!newValues.contains(prop)) {
                continue;
            }
            if (lineProps->binProps[prop]) {
                ppucdPropSets[i].add(lineProps->start, lineProps->end);
            } else {
                ppucdPropSets[i].remove(lineProps->start, lineProps->end);
            }
        }
    }

    if(parseFailed || errorCode.isFailure()) {
        errln("exiting early due to parsing error");
        return;
    }

    // Assert that the PPUCD data and the ICU data are equivalent for all properties
    for(size_t i = 0; i < std::size(propsUnderTest); i++) {
        UnicodeSet icuPropSet;
        UProperty prop = propsUnderTest[i];
        icuPropSet.applyIntPropertyValue(prop, 1, errorCode);
        std::string msg =
            std::string()
            + "ICU & PPUCD versions of property "
            + u_getPropertyName(prop, U_LONG_PROPERTY_NAME);
        assertTrue(msg.c_str(), ppucdPropSets[i] == icuPropSet);
    }
}

View file

@ -52,6 +52,7 @@ public:
void TestPropertyNames();
void TestIDSUnaryOperator();
void TestIDCompatMath();
void TestBinaryPropertyUsingPpucd();
private:

View file

@ -29,24 +29,6 @@ U_NAMESPACE_BEGIN
// Out-of-line destructor definition; there is nothing to release here.
PropertyNames::~PropertyNames() = default;
// TODO: Create a concrete subclass for the default PropertyNames implementation
// using the ICU library built-in property names API & data.
// Currently only the genprops tool uses PreparsedUCD, and provides its own
// PropertyNames implementation using its just-built property names data and its own code.
// At some point, we should use PreparsedUCD in tests, and then we will need the
// default implementation somewhere.
#if 0
// Everything up to the matching #endif is excluded by the preprocessor.

// Looks up the property enum for `name` by forwarding to u_getPropertyEnum().
int32_t
PropertyNames::getPropertyEnum(const char *name) const {
return u_getPropertyEnum(name);
}
// Looks up the value enum for `name` within `property` by forwarding to
// u_getPropertyValueEnum(), casting the integer property to UProperty.
int32_t
PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
return u_getPropertyValueEnum((UProperty)property, name);
}
#endif
UniProps::UniProps()
: start(U_SENTINEL), end(U_SENTINEL),
bmg(U_SENTINEL), bpb(U_SENTINEL),