// Counts the 'pre' and 'post' strings found in pass_all.
//
// A 'pre' string is what is left at the FRONT of a password once the rest of
// the password (the remainder) is a word found in english-sorted.all.
// A 'post' string is what is left at the END of a password once the leading
// part of the password is a word found in english-sorted.all.
//
// Every pre/post string seen more than 10 times is printed with its count.

#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LEN 15              // longest pre/post string that is tracked
#define PASS_ALL_CNT 232000     // max number of passwords read
#define DICT_CNT 1250000        // max number of dictionary words read
#define DICT_FSIZE 14000000     // max bytes read from the dictionary file
#define PASS_ALL_FSIZE 2300000  // max bytes read from the password file
#define DICT_FNAME "english-sorted.all"
#define PASS_FNAME "pass_all"

// Worst-case pool bytes a single password can consume for ONE side (pre or
// post): one string of every length 1..MAX_LEN, each with its NUL terminator.
// sum(n + 1, n = 1..MAX_LEN) == MAX_LEN * (MAX_LEN + 3) / 2
#define AFFIX_BYTES_PER_PASS (MAX_LEN * (MAX_LEN + 3) / 2)

// Tables are indexed [string length][slot]; row 0 is unused.
char *WordsPre[MAX_LEN+1][PASS_ALL_CNT];     // distinct 'pre' strings
char *WordsPost[MAX_LEN+1][PASS_ALL_CNT];    // distinct 'post' strings
int nCntPreWords[MAX_LEN+1][PASS_ALL_CNT];   // hit count per 'pre' string
int nCntPostWords[MAX_LEN+1][PASS_ALL_CNT];  // hit count per 'post' string
int nWordsPre[MAX_LEN+1];                    // used slots per 'pre' row
int nWordsPost[MAX_LEN+1];                   // used slots per 'post' row

char *Passes[PASS_ALL_CNT];  // one pointer per password line
int nPasses;
char *Dict[DICT_CNT];        // one pointer per dictionary line
unsigned nDict;

// pBuf is the next free byte of the single big pool; pBufOrig is its start.
char *pBuf, *pBufOrig;

// Binary search of Dict[0..nDict-1] for 'key' using strcmp ordering.
// Returns true when an exact match exists. The dictionary must already be
// sorted in strcmp order for the result to be meaningful.
bool SearchDict(const char *key)
{
    int lo = 0;
    int hi = (int)nDict - 1;

    while (lo <= hi) {
        int num = hi - lo + 1;  // candidates still in [lo, hi]

        if (num == 1)
            return strcmp(key, Dict[lo]) == 0;

        int half = num / 2;
        int mid = lo + ((num & 1) ? half : half - 1);
        int result = strcmp(key, Dict[mid]);

        if (result == 0)
            return true;

        if (result < 0)
            hi = mid;           // mid < hi whenever num >= 2, so this shrinks
        else if (lo == mid)
            ++lo;               // only happens when num == 2
        else
            lo = mid;
    }
    return false;
}

// Reads up to max_bytes of file 'fname' into the pool at pBuf, splits the
// text into lines on '\r' / '\n', and stores up to max_lines line pointers
// in 'lines'. Advances pBuf past the text. Returns the number of lines
// stored, or -1 if the file could not be opened or read.
static int LoadLines(const char *fname, size_t max_bytes,
                     char **lines, int max_lines)
{
    FILE *in = fopen(fname, "r");
    if (!in) {
        perror(fname);
        return -1;
    }

    size_t got = fread(pBuf, 1, max_bytes, in);
    if (ferror(in)) {
        perror(fname);
        fclose(in);
        return -1;
    }
    if (got == max_bytes && fgetc(in) != EOF)
        fprintf(stderr, "%s: larger than %lu bytes, rest ignored\n",
                fname, (unsigned long)max_bytes);
    fclose(in);

    pBuf[got] = 0;
    char *text = pBuf;
    pBuf += strlen(text) + 1;

    int n = 0;
    for (char *cp = strtok(text, "\r\n"); cp; cp = strtok(NULL, "\r\n")) {
        if (n == max_lines) {
            fprintf(stderr, "%s: more than %d lines, rest ignored\n",
                    fname, max_lines);
            break;
        }
        lines[n++] = cp;
    }
    return n;
}

// Finds the NUL-terminated string 'affix' (affixLen chars long) in one row
// of a words table. If present its count is bumped; otherwise it is copied
// into the pool and appended to the row with a count of 1.
static void CountAffix(char **words, int *counts, int *nWords,
                       const char *affix, int affixLen)
{
    for (int k = 0; k < *nWords; ++k) {
        if (!strcmp(words[k], affix)) {
            ++counts[k];
            return;
        }
    }

    memcpy(pBuf, affix, (size_t)affixLen + 1);  // +1 copies the terminator
    counts[*nWords] = 1;
    words[(*nWords)++] = pBuf;
    pBuf += affixLen + 1;
}

// Loads the dictionary and the password list, tallies every pre/post string,
// and prints those seen more than 10 times. Returns EXIT_FAILURE if either
// input file cannot be read.
int main()
{
    int i, j;

    // ONE buffer holds everything: both files (plus a NUL after each) and
    // the worst case of pre/post strings. Each password can add at most one
    // string per length 1..MAX_LEN on each side, and the number of passwords
    // is capped at PASS_ALL_CNT, so the pool cannot overflow.
    size_t alloc_sz = (size_t)AFFIX_BYTES_PER_PASS * 2 * PASS_ALL_CNT
                    + DICT_FSIZE + PASS_ALL_FSIZE + 2;
    pBuf = new char[alloc_sz];
    pBufOrig = pBuf;

    int loaded = LoadLines(DICT_FNAME, DICT_FSIZE, Dict, DICT_CNT);
    if (loaded < 0) {
        delete[] pBufOrig;
        return EXIT_FAILURE;
    }
    nDict = (unsigned)loaded;

    nPasses = LoadLines(PASS_FNAME, PASS_ALL_FSIZE, Passes, PASS_ALL_CNT);
    if (nPasses < 0) {
        delete[] pBufOrig;
        return EXIT_FAILURE;
    }

    // Pre phase: strip 1..MAX_LEN characters off the front of each password.
    // If the remaining characters (at least 2) form a dictionary word,
    // tally the stripped characters as a 'pre' string.
    for (i = 0; i < nPasses; ++i) {
        char pre[MAX_LEN + 1];
        const char *pass = Passes[i];
        int plen = (int)strlen(pass);

        if (i && i % 1000 == 0)
            fprintf(stderr, "%d pre\r", i);

        for (j = 1; j < plen - 1 && j <= MAX_LEN; ++j) {
            if (!SearchDict(pass + j))
                continue;
            memcpy(pre, pass, j);
            pre[j] = 0;
            CountAffix(WordsPre[j], nCntPreWords[j], &nWordsPre[j], pre, j);
        }
    }

    // Post phase: for each split point j, if the first j characters form a
    // dictionary word, tally the remaining len-j characters (3..MAX_LEN of
    // them) as a 'post' string.
    for (i = 0; i < nPasses; ++i) {
        char *pass = Passes[i];
        int len = (int)strlen(pass);
        // Start where the suffix is at most MAX_LEN characters long.
        int first = (len > MAX_LEN + 1) ? len - MAX_LEN : 1;

        if (i && i % 1000 == 0)
            fprintf(stderr, "%d post\r", i);

        for (j = first; j < len - 2; ++j) {
            // Terminate the password at j just long enough to look up the
            // prefix; this avoids a fixed-size copy that long lines could
            // overflow.
            char saved = pass[j];
            pass[j] = 0;
            bool isWord = SearchDict(pass);
            pass[j] = saved;

            if (isWord)
                CountAffix(WordsPost[len - j], nCntPostWords[len - j],
                           &nWordsPost[len - j], pass + j, len - j);
        }
    }

    for (i = 1; i <= MAX_LEN; ++i) {
        for (j = 0; j < nWordsPre[i]; ++j)
            if (nCntPreWords[i][j] > 10)
                printf ("%05d `%s` (Pre)\n", nCntPreWords[i][j], WordsPre[i][j]);
    }
    for (i = 1; i <= MAX_LEN; ++i) {
        for (j = 0; j < nWordsPost[i]; ++j)
            if (nCntPostWords[i][j] > 10)
                printf ("%05d `%s` (Post)\n", nCntPostWords[i][j], WordsPost[i][j]);
    }

    delete[] pBufOrig;
    return EXIT_SUCCESS;
}