kjs Library API Documentation

regexp.cpp

00001 // -*- c-basic-offset: 2 -*- 00002 /* 00003 * This file is part of the KDE libraries 00004 * Copyright (C) 1999-2001 Harri Porten (porten@kde.org) 00005 * 00006 * This library is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2 of the License, or (at your option) any later version. 00010 * 00011 * This library is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Lesser General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public 00017 * License along with this library; if not, write to the Free Software 00018 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00019 * 00020 */ 00021 00022 #include "regexp.h" 00023 00024 #include "lexer.h" 00025 #include <stdio.h> 00026 #include <stdlib.h> 00027 #include <string.h> 00028 00029 using namespace KJS; 00030 00031 RegExp::RegExp(const UString &p, int f) 00032 : pat(p), flgs(f), m_notEmpty(false) 00033 { 00034 // JS regexps can contain Unicode escape sequences (\uxxxx) which 00035 // are rather uncommon elsewhere. As our regexp libs don't understand 00036 // them we do the unescaping ourselves internally. 00037 UString intern; 00038 if (p.find('\\') >= 0) { 00039 bool escape = false; 00040 for (int i = 0; i < p.size(); ++i) { 00041 UChar c = p[i]; 00042 if (escape) { 00043 escape = false; 00044 // we only care about \uxxxx 00045 if (c == 'u' && i + 4 < p.size()) { 00046 int c0 = p[i+1].unicode(); 00047 int c1 = p[i+2].unicode(); 00048 int c2 = p[i+3].unicode(); 00049 int c3 = p[i+4].unicode(); 00050 if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) && 00051 Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) { 00052 c = Lexer::convertUnicode(c0, c1, c2, c3); 00053 intern += UString(&c, 1); 00054 i += 4; 00055 continue; 00056 } 00057 } 00058 intern += UString('\\'); 00059 intern += UString(&c, 1); 00060 } else { 00061 if (c == '\\') 00062 escape = true; 00063 else 00064 intern += UString(&c, 1); 00065 } 00066 } 00067 } else { 00068 intern = p; 00069 } 00070 00071 #ifdef HAVE_PCREPOSIX 00072 int pcreflags = 0; 00073 const char *perrormsg; 00074 int errorOffset; 00075 00076 if (flgs & IgnoreCase) 00077 pcreflags |= PCRE_CASELESS; 00078 00079 if (flgs & Multiline) 00080 pcreflags |= PCRE_MULTILINE; 00081 00082 pcregex = pcre_compile(intern.ascii(), pcreflags, 00083 &perrormsg, &errorOffset, NULL); 00084 #ifndef NDEBUG 00085 if (!pcregex) 00086 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg); 00087 #endif 00088 00089 #ifdef PCRE_INFO_CAPTURECOUNT 00090 // Get number of subpatterns that will be returned 00091 int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns); 00092 if (rc != 0) 00093 #endif 00094 nrSubPatterns = 0; // fallback. We always need the first pair of offsets. 00095 00096 #else /* HAVE_PCREPOSIX */ 00097 00098 nrSubPatterns = 0; // determined in match() with POSIX regex. 00099 int regflags = 0; 00100 #ifdef REG_EXTENDED 00101 regflags |= REG_EXTENDED; 00102 #endif 00103 #ifdef REG_ICASE 00104 if ( f & IgnoreCase ) 00105 regflags |= REG_ICASE; 00106 #endif 00107 00108 //NOTE: Multiline is not feasible with POSIX regex. 00109 //if ( f & Multiline ) 00110 // ; 00111 // Note: the Global flag is already handled by RegExpProtoFunc::execute 00112 00113 if (regcomp(&preg, intern.ascii(), regflags) != 0) { 00114 /* TODO: throw JS exception */ 00115 regcomp(&preg, "", regflags); 00116 } 00117 #endif 00118 } 00119 00120 RegExp::~RegExp() 00121 { 00122 #ifdef HAVE_PCREPOSIX 00123 if (pcregex) 00124 pcre_free(pcregex); 00125 #else 00126 /* TODO: is this really okay after an error ? */ 00127 regfree(&preg); 00128 #endif 00129 } 00130 00131 UString RegExp::match(const UString &s, int i, int *pos, int **ovector) 00132 { 00133 if (i < 0) 00134 i = 0; 00135 if (ovector) 00136 *ovector = 0L; 00137 int dummyPos; 00138 if (!pos) 00139 pos = &dummyPos; 00140 *pos = -1; 00141 if (i > s.size() || s.isNull()) 00142 return UString::null; 00143 00144 #ifdef HAVE_PCREPOSIX 00145 CString buffer(s.cstring()); 00146 int bufferSize = buffer.size(); 00147 int ovecsize = (nrSubPatterns+1)*3; // see pcre docu 00148 if (ovector) *ovector = new int[ovecsize]; 00149 if (!pcregex) 00150 return UString::null; 00151 00152 if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i, 00153 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0, // see man pcretest 00154 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH) 00155 { 00156 // Failed to match. 00157 if ((flgs & Global) && m_notEmpty && ovector) 00158 { 00159 // We set m_notEmpty ourselves, to look for a non-empty match 00160 // (see man pcretest or pcretest.c for details). 00161 // So we don't stop here, we want to try again at i+1. 00162 #ifndef NDEBUG 00163 fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n"); 00164 #endif 00165 m_notEmpty = 0; 00166 if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i+1, 0, 00167 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH) 00168 return UString::null; 00169 } 00170 else // done 00171 return UString::null; 00172 } 00173 00174 // Got a match, proceed with it. 00175 00176 if (!ovector) 00177 return UString::null; // don't rely on the return value if you pass ovector==0 00178 #else 00179 const uint maxMatch = 10; 00180 regmatch_t rmatch[maxMatch]; 00181 00182 char *str = strdup(s.ascii()); // TODO: why ??? 00183 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) { 00184 free(str); 00185 return UString::null; 00186 } 00187 free(str); 00188 00189 if (!ovector) { 00190 *pos = rmatch[0].rm_so + i; 00191 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so); 00192 } 00193 00194 // map rmatch array to ovector used in PCRE case 00195 nrSubPatterns = 0; 00196 for(uint j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) 00197 nrSubPatterns++; 00198 int ovecsize = (nrSubPatterns+1)*3; // see above 00199 *ovector = new int[ovecsize]; 00200 for (uint j = 0; j < nrSubPatterns + 1; j++) { 00201 if (j>maxMatch) 00202 break; 00203 (*ovector)[2*j] = rmatch[j].rm_so + i; 00204 (*ovector)[2*j+1] = rmatch[j].rm_eo + i; 00205 } 00206 #endif 00207 00208 *pos = (*ovector)[0]; 00209 #ifdef HAVE_PCREPOSIX // TODO check this stuff in non-pcre mode 00210 if ( *pos == (*ovector)[1] && (flgs & Global) ) 00211 { 00212 // empty match, next try will be with m_notEmpty=true 00213 m_notEmpty=true; 00214 } 00215 #endif 00216 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]); 00217 } 00218 00219 #if 0 // unused 00220 bool RegExp::test(const UString &s, int) 00221 { 00222 #ifdef HAVE_PCREPOSIX 00223 int ovector[300]; 00224 CString buffer(s.cstring()); 00225 00226 if (s.isNull() || 00227 pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0, 00228 0, ovector, 300) == PCRE_ERROR_NOMATCH) 00229 return false; 00230 else 00231 return true; 00232 00233 #else 00234 00235 char *str = strdup(s.ascii()); 00236 int r = regexec(&preg, str, 0, 0, 0); 00237 free(str); 00238 00239 return r == 0; 00240 #endif 00241 } 00242 #endif
KDE Logo
This file is part of the documentation for kjs Library Version 3.4.0.
Documentation copyright © 1996-2004 the KDE developers.
Generated on Thu Apr 14 00:18:53 2005 by doxygen 1.3.7 written by Dimitri van Heesch, © 1997-2003