# Takes a string as input. Detects SVG tags likely to designate geographical
# divisions such as continents, countries, administrative subdivisions, etc.

import re


class IdTagDetector:
    """Extract candidate geographic labels from the text of an SVG document.

    The detector scans the raw SVG text for a fixed list of attribute
    prefixes (``self.labelPrefixes``) and collects the quoted value that
    follows each of them, provided the value does not start with a digit.

    Attributes:
        arg: The SVG document as a string (``None`` is treated as empty).
        detectedTags: Result of the last ``detect()`` call. It starts out as
            an empty list and is replaced by a ``set`` of strings by
            ``detect()``.
        stoplist, stoplistPath, whitelist, whitelistPath: Placeholders for
            the stop/white list support, which is not implemented yet.
        labelPrefixes: Attribute prefixes searched for, taken literally.
        regex: Pattern fragment appended to each prefix; its single group
            captures the attribute value.
    """

    # Auto-generated element ids (e.g. "path1234", "g12", "rect3001-7") carry
    # no geographic meaning. The pattern is only anchored at the start, so
    # suffixed variants such as "path1234-5" are rejected as well.
    _AUTO_ID = re.compile(
        r"(path|svg|g|clipPath|rect|stop|style|metadata|title|defs|linearGradient)\d+"
    )

    def __init__(self, arg):
        """Store the SVG text ``arg`` and set up the default configuration."""
        self.arg = arg
        self.detectedTags = []
        self.stoplist = []
        self.stoplistPath = "../configs/stoplists"
        self.whitelist = []
        self.whitelistPath = "../configs/whitelists"
        # The prefixes that have been observed to hold geographic labels in
        # the test maps.
        self.labelPrefixes = ["path class", "inkscape:label", "label", "id"]
        # Optional spaces around "=", then a double-quoted value whose first
        # character is neither a digit nor the closing quote. Excluding the
        # quote keeps an empty attribute (id="") from swallowing the text up
        # to the next quote in the document.
        self.regex = r'\s*=\s*"([^0-9"][^"]*)"'

    def _attributePattern(self, labelPrefix):
        """Return the compiled pattern matching ``labelPrefix="value"``.

        The negative look-behind stops a prefix from matching in the middle
        of a longer attribute name (``id`` inside ``showgrid``), while still
        allowing namespaced names such as ``inkscape:label`` to be found by
        the plain ``label`` prefix.
        """
        return re.compile(r"(?<![\w-])" + re.escape(labelPrefix) + self.regex)

    # TODO: implement this
    def loadStopList(self):
        """Placeholder: currently always returns an empty list."""
        return []

    # TODO: implement this
    def loadWhiteList(self):
        """Placeholder: currently always returns an empty list."""
        return []

    def listTags(self):
        """Print what the last ``detect()`` call has found."""
        print(self.detectedTags)

    def detect(self):
        """Collect the label candidates into ``self.detectedTags``.

        For every prefix in ``self.labelPrefixes`` the quoted value following
        it is retrieved unless it starts with a digit. Values that look like
        auto-generated element ids are discarded. The result is stored as a
        ``set``, so duplicates found through several prefixes collapse.
        """
        text = self.arg or ""
        candidates = set()
        for labelPrefix in self.labelPrefixes:
            for value in self._attributePattern(labelPrefix).findall(text):
                if not self._AUTO_ID.match(value):
                    candidates.add(value)
        self.detectedTags = candidates

    def detectedPrefixes(self):
        """Return the prefixes that occur with a usable value in the document.

        The result is a list, in ``self.labelPrefixes`` order, of every
        prefix followed at least once by a quoted value that does not start
        with a digit. The auto-generated-id filter of ``detect()`` is not
        applied here.
        """
        text = self.arg or ""
        return [
            labelPrefix
            for labelPrefix in self.labelPrefixes
            if self._attributePattern(labelPrefix).search(text)
        ]


#####################################################################
# Labels observed in the test maps:
#
#   ../maps/Blank_Map_Africa_1932.svg             ==> tn
#   ../maps/USA_Counties_with_FIPS_and_names.svg  ==> Teton, WY
#   ../maps/Blank_map_of_Europe_1815.svg          ==> Serbia
#   ../maps/World98.svg                           ==> Iran:Semnan Province
#
# Attribute forms seen holding such labels:
#
#   path class="land tn"
#   inkscape:label="Teton, WY"
#   id="Serbia"
#####################################################################