Fix logic for Named Entity replacement

This will be improved further later on
This commit is contained in:
iska
2014-10-31 18:06:15 +01:00
parent 8a2422426e
commit bc3a7165ec
3 changed files with 34 additions and 18 deletions
+22 -18
View File
@@ -317,36 +317,40 @@
NSString *entityName = nil;
#warning Improve Named Entity Search
UTF32Char inputCharacter = [_inputStreamReader consumeNextInputCharacter];
NSArray *names = [HTMLTokenizerEntities entityNames];
// NSArray *names = [HTMLTokenizerEntities entityNames];
NSArray *names = NAMES();
NSMutableString *name = [NSMutableString stringWithString:StringFromUTF32Char(inputCharacter)];
NSUInteger searchIndex = 0;
while (YES) {
NSPredicate *predicate = [NSPredicate predicateWithFormat:@"SELF BEGINSWITH %@", name];
names = [names filteredArrayUsingPredicate:predicate];
if (names.count == 0) break;
searchIndex= [names indexOfObject:name
inSortedRange:NSMakeRange(searchIndex, names.count - searchIndex)
options:NSBinarySearchingInsertionIndex | NSBinarySearchingFirstEqual
usingComparator:^NSComparisonResult(id obj1, id obj2) {
return [obj1 compare:obj2];
}];
if (searchIndex >= names.count) break;
if ([[names objectAtIndex:searchIndex] isEqualToString:name]) {
entityName = [name copy];
}
if ([name hasSuffix:@";"]) break;
inputCharacter = [_inputStreamReader consumeNextInputCharacter];
if (inputCharacter == EOF) break;
[name appendString:StringFromUTF32Char(inputCharacter)];
if ([names containsObject:name]) {
entityName = [name copy];
if ([entityName hasSuffix:@";"]) {
break;
}
}
}
if (entityName == nil) {
if ([name hasSuffix:@";"]) {
[self emitParseError:@"Undefined named entity with semicolon found"];
} else {
NSString *nextAlphanumeric = [_inputStreamReader consumeAlphanumericCharacters];
if (nextAlphanumeric != nil) {
[name appendString:nextAlphanumeric];
}
[_inputStreamReader rewindToMarkedLocation];
if ([_inputStreamReader consumeAlphanumericCharacters] != nil) {
if ([_inputStreamReader consumeString:@";" caseSensitive:NO]) {
[self emitParseError:@"Undefined named entity with semicolon found"];
}
+2
View File
@@ -14,3 +14,5 @@
+ (NSString *)replacementForNamedCharacterEntity:(NSString *)entity;
@end
// Returns an NSArray holding the named character reference names; the
// function is defined in the implementation file next to the name table.
extern NSArray * NAMES();
+10
View File
@@ -2243,6 +2243,16 @@
static NSDictionary *_entities;
// File-local C array of entity-name strings, generated with an X-macro:
// NAMED_CHARACTER_REFERENCE is temporarily defined to emit only the `name`
// argument (prefixed with `@`, followed by a comma) and drop `value`, then
// the NAMED_CHARACTER_REFERENCES list is expanded and the helper macro is
// undefined again.
// NOTE(review): NAMED_CHARACTER_REFERENCES is defined outside this excerpt;
// presumably each `name` is a C string literal so that `@name` forms an
// NSString literal — confirm against the macro definition.
static NSString * x[] = {
#define NAMED_CHARACTER_REFERENCE( name, value ) @name,
NAMED_CHARACTER_REFERENCES
#undef NAMED_CHARACTER_REFERENCE
};
/**
 * Returns the named character reference names stored in the static table `x`
 * as an NSArray.
 *
 * A new array is created on every call.
 *
 * @return An NSArray containing every element of `x`, in table order.
 */
NSArray * NAMES(void) {
    // Derive the element count from the table itself rather than hard-coding
    // it, so the array can neither read past the end of `x` nor silently drop
    // entries when the NAMED_CHARACTER_REFERENCES list changes size.
    NSUInteger count = sizeof(x) / sizeof(x[0]);
    return [[NSArray alloc] initWithObjects:x count:count];
}
@implementation HTMLTokenizerEntities
+ (void)initialize