知识大全 JAVA版StarDict星际译王简单实现

Posted 2022-07-19 字节

篇首语：家资是何物，积帙列梁梠。本文由小常识网(cha138.com)小编为大家整理，主要介绍了知识大全 JAVA版StarDict星际译王简单实现相关的知识，希望对你有一定的参考价值。

JAVA版StarDict星际译王简单实现 以下文字资料是由(全榜网 www.cha138.com)小编为大家搜集整理后发布的内容，让我们赶快一起来看一下吧！

　　由胡正开发的星际译王是Linux平台上很强大的一个开源的翻译软件（也有Windows版本的）支持多种词库多种语言版本尤其词库设计比较合理之前看到一篇博文《星际译王词库应用自制英汉词典》中用简短的程序就实现了词典的基本功能不过那个是Linux 下的C/C++版本的于是决定参考移植一个JAVA版本　　import java io ByteArrayOutputStream; 　　import java io IOException; 　　import java io InputStream; 　　import java io *; 　　/** 　　* @docRoot 　　* Java版词典测试版可以在控制台下输入要查询的单词回车后会给出单词在词典中的释义　　* 词典采用星际译王的词典本程序主要针对英汉词典　　* 　　* @author menglongbor 　　* @updateDate 　　* @version v 　　* 　　* 相关参考链接　　* l 　　* _zhu_xiang/item/ f be e eee 　　* 　　* 　　* 　　* /downloads/list 　　* 　　*/ 　　public class testdict 　　　　final static intMAX_WORD= ;// 最长输入单词字符数　　final static intMAX_KEYS= ;// 个字母+ 开头的后缀　　final static intSIZEINT= ; 　　final static StringKEY[]= // 个字母索引+ 开头的后缀不区分大小写　　 A b c d e f 　　 g h i j k l m n o p q r s 　　 t u v w x y z ; 　　public static InputStreamisidx= null;// 读取idx文件时所要的流　　public static InputStreamisdict= null;// 读取dict文件时所要的流　　public static longSTREAM_LOCAL= ;// 记录单词索引在文件流中的位置　　public static StringidxfileString= oxford gb idx ;// idx文件路径　　public static StringdictfileString= oxford gb dict ;// dict文件路径　　/** 　　* 从idx文件中获取当前目标单词　　* @param word_buf 保存的是c/c++字符串数组转换为JAVA字符串　　* @param data_poffset 用来保存单词的data偏移位置信息　　* @param data_plength 用来保存单词的data长度信息　　* @param len 　　* @return 　　*/ 　　public static boolean get_word（String[] word_buf int[] data_poffset 　　int[] data_plength int[] len）　　　　// int len = ; 　　boolean flag = true; 　　len[ ] = ; 　　int index = ; 　　byte wd[] = new byte[MAX_WORD]; 　　int value = ; 　　try 　　　　// 读取单词对每个字母开头的单词都进行搜索最多考虑个字符的单词　　// 读到单词结束符\\ 时赋值表达式的值就不满足while条件而退出　　while （true）　　　　index = isidx read（）　　STREAM_LOCAL++;// 每读取一次位置标识加一以记录下单词在文件流中的起始位置　　if （index == ）　　　　// isidx reset（）　　flag = false; 　　break; 　　　　if （（index != ） && （len[ ] < MAX_WORD））　　　　wd[len[ ]] = （byte） index;// 将int转换为byte 　　len[ ]++; 　　 else 　　　　break; 　　　　　　// 转换为JAVA字符串　　// 此处不用再需要像c/c++那样去掉了最后那个结束符了　　byte wd [] = new byte[len[ ]]; 　　for （int i = ; i < len[ ]; i++）　　　　wd [i] = wd[i]; 　　　　word_buf[ ] = new String（wd ）　　// System out println（ get_word: 
+word_buf[ ]+ len: +len[ ]）　　// wd = null;// 释放内存　　// wd = null; 　　// 读取偏移量值　　for （int i = ; i < SIZEINT; i++）　　　　// 将个byte转换为int 　　int shift = （ i） * ; 　　index = isidx read（）　　STREAM_LOCAL++;// 每读取一次位置标识加一以记录下单词在文件流中的起始位置　　if （index == ）　　　　// isidx reset（）　　flag = false; 　　return flag; 　　　　value += （index & x FF）《 shift; 　　　　data_poffset[ ] = value; 　　// 读取区块大小值　　value = ; 　　for （int i = ; i < SIZEINT; i++）　　　　// 将个byte转换为int 　　int shift = （ i） * ; 　　index = isidx read（）　　STREAM_LOCAL++;// 每读取一次位置标识加一以记录下单词在文件流中的起始位置　　if （index == ）　　　　// isidx reset（）　　flag = false; 　　return flag; 　　　　value += （index & x FF）《 shift; 　　　　data_plength[ ] = value; 　　　　catch （Exception e）　　　　System out println（ idx file read error! ）　　　　// System out println（ Now local is: +STREAM_LOCAL）　　// 得到单词字符长度　　return flag; 　　　　/** 　　* 通过偏移位置offset和长度length 来从dict文件中获取data内容UTF 编码的字符　　* @param offset 要读取的内容的起始偏移为字节数　　* @param length 要读取的内容的数据块大小为字节数　　* @return 字节数组的data int 　　*/ 　　public static byte[] get_data（int[] offset int[] length）　　　　long oft = offset[ ]; 　　long len = length[ ]; 　　long skip; 　　byte data_buf[] = new byte[length[ ]]; 　　System out println（ This word s + offset: + offset[ ] + len: 　　+ length[ ]）　　try 　　　　isdict reset（）　　long valuedata = isdict available（）　　if （valuedata < oft + len）　　　　System out println（ No so much value data! + valuedata）　　　　// skip=isdict skip（oft）　　skip = skipBytesFromStream（isdict oft）　　if （skip != oft）　　　　System out println（ Skip + skip + dict file error! ）　　　　if （isdict read（data_buf） == ）　　　　System out println（ Arrive at the end of file! ）　　　　// // Unicode 　　// StringBuffer sb = new StringBuffer（）　　// 　　// int size =isdict read（data_buf）　　// 　　// for （int j = ; j < size;）　　// 　　// 　　// int l = data_buf[j++]; 　　// 　　// int h = data_buf[j++]; 　　// 　　// char c = （char）（（l & xff） | （（h 《） & xff ））　　// 　　// sb append（c）　　// 　　// 　　// 　　// // return sb toString（）　　　　catch （Exception e）　　　　data_buf = null; 　　System out println（ dict file read error! 
）　　e printStackTrace（）　　　　if （data_buf == null）　　　　return null; 　　　　return data_buf; 　　　　/** 　　* utf 解码参考自用法　　* 假如 newContent 为UTF 编码的字符串 byte[] b = newContent getBytes（） newContent = 　　* URLEncoder UTF Decode（ b b length ）　　* @param in 要进行解码的UTF 编码的字节数组　　* @param offset 　　* @param length 　　* @return 　　*/ 　　public static String UTF Decode（byte in[] int offset int length）　　　　StringBuffer buff = new StringBuffer（）　　int max = offset + length; 　　for （int i = offset; i < max; i++）　　　　char c = ; 　　if （（in[i] & x ） == ）　　　　c = （char） in[i]; 　　 else if （（in[i] & xe ） == xc ） // 　　　　c |= （（in[i] & x f）《） // 　　i++; 　　c |= （（in[i] & x f）《） // 　　 else if （（in[i] & xf ） == xe ） // 　　　　c |= （（in[i] & x f）《） // 　　i++; 　　c |= （（in[i] & x f）《） // 　　i++; 　　c |= （（in[i] & x f）《） // 　　 else if （（in[i] & xf ） == xf ） // 　　　　c |= （（in[i] & x ）《） // （move not ）　　i++; 　　c |= （（in[i] & x f）《） // 　　i++; 　　c |= （（in[i] & x f）《） // 　　i++; 　　c |= （（in[i] & x f）《） // 　　 else 　　　　c = ; 　　　　buff append（c）　　　　return buff toString（）　　　　public static byte[] UTF Encode（String str）　　　　ByteArrayOutputStream bos = new ByteArrayOutputStream（）　　try 　　　　int strlen = str length（）　　for （int i = ; i < strlen; i++）　　　　char t = str charAt（i）　　int c = ; 　　c |= （t & xffff）　　if （c >= && c < x ）　　　　bos write（（byte）（c & xff））　　 else if （c > x f && c < x ）　　　　bos write（（byte）（（（c >>> ） & x f） | xc ））　　bos write（（byte）（（（c >>> ） & x f） | x ））　　 else if （c > x ff && c < x ）　　　　bos write（（byte）（（（c >>> ） & x f） | xe ）） // < 　　// correction 　　// （mb）　　bos write（（byte）（（（c >>> ） & x f） | x ））　　bos write（（byte）（（（c >>> ） & x f） | x ））　　 else if （c > x ffff && c < xfffff）　　　　bos write（（byte）（（（c >>> ） & x ） | xf ））　　bos write（（byte）（（（c >>> ） & x f） | x ））　　bos write（（byte）（（（c >>> ） & x f） | x ））　　bos write（（byte）（（（c >>> ） & x f） | x ））　　　　　　bos flush（）　　　　catch （Exception e）　　　　　　return bos toByteArray（）　　　　/** 　　* 将UTF 字节数据转化为Unicode字符串　　* 　　* @param utf_data 　　* byte[] UTF 编码字节数组　　* @param len 　　* int 字节数组长度　　* @return 
String 变换后的Unicode编码字符串　　*/ 　　public static String UTF Uni（byte[] utf_data int len）　　　　StringBuffer unis = new StringBuffer（）　　char unic = ; 　　int ptr = ; 　　int cntBits = ; 　　for （ ptr < len;）　　　　cntBits = getCntBits（utf_data[ptr]）　　if （cntBits == ）　　　　++ptr; 　　continue; 　　 else if （cntBits == ）　　　　unic = UTFC UniC（utf_data ptr cntBits）　　++ptr; 　　 else 　　　　unic = UTFC UniC（utf_data ptr cntBits）　　ptr += cntBits; 　　　　unis append（unic）　　　　return unis toString（）　　　　/** 　　* 将指定的UTF 字节组合成一个Unicode编码字符　　* @param utf byte[] UTF 字节数组　　* @param sptr int 编码字节起始位置　　* @param cntBits int 编码字节数　　* @return char 变换后的Unicode字符　　*/ 　　public static char UTFC UniC（byte[] utf int sptr int cntBits）　　　　/* 　　* Unicode < > UTF U U F: xxxxxxx U 　　* U FF: xxxxx xxxxxx U U FFFF: xxxx 　　* xxxxxx xxxxxx U U FFFFF: xxx xxxxxx xxxxxx 　　* xxxxxx U U FFFFFF: xx xxxxxx xxxxxx xxxxxx 　　* xxxxxx U U FFFFFFF: x xxxxxx xxxxxx xxxxxx 　　* xxxxxx xxxxxx 　　*/ 　　int uniC = ; // represent the unicode char 　　byte firstByte = utf[sptr]; 　　int ptr = ; // pointer ~ 　　// resolve single byte UTF encoding char 　　if （cntBits == ）　　return （char） firstByte; 　　// resolve the first byte 　　firstByte &= （《（ cntBits）） ; 　　// resolve multiple bytes UTF encoding char（except the first byte）　　for （int i = sptr + cntBits ; i > sptr; i）　　　　byte utfb = utf[i]; 　　uniC |= （utfb & x f）《 ptr; 　　ptr += ; 　　　　uniC |= firstByte 《 ptr; 　　return （char） uniC; 　　　　/** 　　* 根据给定字节计算UTF 编码的一个字符所占字节数 UTF 规则定义字节标记只能为或 ~ 　　* @param b 　　* @return 　　*/ 　　private static int getCntBits（byte b）　　　　int cnt = ; 　　if （b == ）　　return ; 　　for （int i = ; i >= ; i）　　　　if （（（b 》 i） & x ） == ）　　++cnt; 　　else 　　break; 　　　　return （cnt > || cnt == ） : cnt; 　　　　/** 　　* 显示data内容　　* @param data_buf UTF 的单词释义数组　　* @param data_length UTF 的单词释义数组长度　　*/ 　　public static void display_data（byte[] data_buf int data_length[]）　　　　// 将UTF byte字节数组转为当前环境字符并显示　　// String tempString = UTF Decode（data_buf data_length[ ]）　　String tempString = UTF Uni（data_buf data_length[ ]）　　// 
String tempString = new String（data_buf）　　data_buf = null; 　　System out println（tempString）　　　　/** 　　* 从idx文件中搜索由word指定的单词并保存相应的偏移和长度信息　　* @param word 　　* @param data_poffset 　　* @param data_plength 　　* @return 是否搜索成功　　*/ 　　public static boolean search_word（String word int[] data_poffset 　　int[] data_plength）　　　　String wd[] = new String[ ]; 　　boolean temp = false; 　　int len[] = new int[ ]; 　　// 从idx文件中获取当前目标单词　　// for （get_word（wd data_poffset data_plength） end; get_word（wd 　　// data_poffset data_plength））　　// 　　while （get_word（wd data_poffset data_plength len））　　　　// System out println（ pared_word: +wd[ ]）　　// if （wd[pareToIgnoreCase（word） == ） // 　　// 比较字符串s 和s 但不区分字母的大小写　　if （strsEqualsIgnoreCase（wd[ ] word） == ）　　　　System out println（ pared_word: + word + + wd[ ]）　　temp = true; 　　break; 　　　　　　return temp; 　　　　/** 　　* 从标准输入获取待查询的单词控制台下为GBK字符字典索引中的英文单词字母也是如此　　* @param max_len 　　* @param count 　　* @return 　　*/ 　　public static String get_input（int max_len int[] count）　　　　byte input_buf[] = new byte[max_len]; 　　count[ ] = ; 　　String tempString[] = new String[ ]; 　　try 　　　　count[ ] = System in read（input_buf） ;// 返回实际读取到的字符数减去个控制字符　　byte temp_buf[] = new byte[count[ ]]; 　　for （int i = ; i < count[ ]; i++）　　　　temp_buf[i] = input_buf[i]; 　　　　tempString[ ] = new String（temp_buf）　　　　catch （Exception e）　　　　System out println（ Input error! ）　　　　System out println（ Your input is: + tempString[ ]）　　return tempString[ ]; 　　　　/** 　　* 从标准输入获取待查询的单词控制台下为GBK字符字典索引中的英文单词字母也是如此　　* @param input_buf 　　* @param count 　　* @return 　　*/ 　　public static byte[] get_input（byte[] input_buf int[] count）　　　　try 　　　　count[ ] = System in read（input_buf） ;// 返回实际读取到的字符数减去个控制字符　　　　catch （Exception e）　　　　input_buf = null; 　　System out println（ Input error! 
）　　　　return input_buf; 　　　　/** 　　* 缓存KEYS在idx中的偏移信息以便加快search_word的搜索速度　　* @param idx_cache 保存每个单字母单词对应的起始位置　　* @return 　　*/ 　　public static void cache_idx（long[] idx_cache）　　　　int i; 　　long[] p = idx_cache; 　　int unused [] = new int[ ]; 　　int unused [] = new int[ ]; 　　try 　　　　// 将文件内部的位置指针重新指向一个流（数据流/文件）的开头返回FILE指针当前位置　　// 然后重新遍历整个文件搜寻下一个字母开头的单词　　isidx reset（）　　STREAM_LOCAL = ; 　　for （i = ; i < MAX_KEYS; i++）　　　　// System out println（ Start search_word: + KEY[i]）　　if （search_word（KEY[i] unused unused ））// 从idx文件中搜索由word指定的单词并保存相应的偏移和长度信息　　　　p[i] = STREAM_LOCAL; // 返回当前文件位置　　// String tempString = Long toString（STREAM_LOCAL）　　// System out println（KEY[i] + s local is: + tempString）　　System out println（KEY[i] + s local is: + STREAM_LOCAL 　　+ offset: + unused [ ] + length: + unused [ ]）　　 else 　　p[i] = ; 　　　　// isidx reset（）　　　　catch （Exception e）　　　　// TODO: handle exception 　　　　　　/** 　　* 定位由word指定的单词在idx文件中的大概偏移位置　　* @param word 　　* @param idx_cache 　　* @return 　　*/ 　　public static long locate_idx（String word long[] idx_cache）　　　　int i = ; 　　int pre = ; 　　String tempString = word toLowerCase（）　　while （i < MAX_KEYS && KEY[i] charAt（） < tempString charAt（））　　　　pre = i; 　　++i; 　　　　if （tempString charAt（） == ）　　　　pre = ; 　　　　System out println（ Now word s locate is: + idx_cache[pre]）　　return idx_cache[pre]; 　　　　/** 　　* 主要查询函数　　*/ 　　public static void consult（）　　　　byte data[] = null;// 释义数据 UTF 数据　　long idx[] = new long[MAX_KEYS];// 个字母孤立单词+ 开头的后缀对应的索引缓冲　　int offset[] = new int[ ]; 　　int length[] = new int[ ]; 　　System out println（ Start cache_idx…！）　　try 　　　　System out println（ Open files…！）　　// 读取字典索引文件　　isidx = new BufferedInputStream（new FileInputStream（　　idxfileString））　　isidx mark（isidx available（） + ）　　if （！isidx markSupported（））　　　　System out println（ This stream do not support mark…！）　　　　　　catch （Exception e）　　　　System out println（ Open files error! 
）　　e printStackTrace（）　　　　cache_idx（idx） // 缓存KEYS在idx中的偏移信息以便加快search_word的搜索速度　　try 　　　　isdict = new BufferedInputStream（new FileInputStream（　　dictfileString））　　isdict mark（isdict available（） + ）　　if （！isdict markSupported（））　　　　System out println（ This stream do not support mark…！）　　　　　　catch （Exception e）　　　　System out println（ Open files error! ）　　e printStackTrace（）　　　　while （true）　　　　System out println（ INPUT A WORD OR PHRASE: ）　　int count[] = new int[ ]; 　　String word = get_input（MAX_WORD count）　　long skips skips ; 　　if （count[ ] > ）// 从控制台得到输入单词字符　　　　try 　　　　// 从文件开头跳到单词大致索引所在位置　　// isidx mark（）　　isidx reset（）　　skips = locate_idx（word idx）　　// skips = isidx skip（skips ）　　skips = skipBytesFromStream（isidx skips ）　　System out 　　 println（ skips : + skips + skips : + skips ）　　　　catch （Exception e）　　　　System out println（ locate_idx run error ）　　e printStackTrace（）　　　　if （search_word（word offset length））　　　　data = get_data（offset length）　　display_data（data length）　　data = null; 　　 else 　　System out println（ SORRY + word + CANNOT BE FOUND!\\n ）　　System out 　　 println（ \\n \\n\\n ）　　 else 　　break; 　　　　　　/** 　　* 不区分大小写比较两个字符串　　* 　　* @param s 　　* @param s 　　* @return 　　*/ 　　public static int strsEqualsIgnoreCase（String s String s ）　　　　int n = s length（） n = s length（）　　for （int i = i = ; i < n && i < n ; i ++ i ++）　　　　char c = s charAt（i ）　　char c = s charAt（i ）　　if （c != c ）　　　　// 源字符串全部都转为大写字符串　　c = Character toUpperCase（c ）　　c = Character toUpperCase（c ）　　if （c != c ）　　　　// 源字符串全部都转为小写字符串　　c = Character toLowerCase（c ）　　c = Character toLowerCase（c ）　　if （c != c ）　　　　return c c ; 　　　　　　　　　　return n n ;// 如果其中一个或者两个String都比较完了还没有同样的char的话那就return两个String的长度差距　　　　/** 　　* 重写了Inpustream 中的skip（long n）方法将数据流中起始的n 个字节跳过　　* 参考　　* @param inputStream 　　* @param n 　　* @return 　　*/ 　　private static long skipBytesFromStream（InputStream inputStream long n）　　　　long remaining = n; // SKIP_BUFFER_SIZE is used to determine the size of 　　// skipBuffer 　　int SKIP_BUFFER_SIZE = ; // 
skipBuffer is initialized in 　　// skip（long） if needed 　　byte[] skipBuffer = null; 　　int nr = ; 　　if （skipBuffer == null）　　　　skipBuffer = new byte[SKIP_BUFFER_SIZE]; 　　　　byte[] localSkipBuffer = skipBuffer; 　　if （n <= ）　　　　return ; 　　　　while （remaining > ）　　　　try 　　　　nr = inputStream read（localSkipBuffer （int） Math min（　　SKIP_BUFFER_SIZE remaining））　　　　catch （IOException e）　　　　e printStackTrace（）　　　　if （nr < ）　　　　break; 　　　　remaining = nr; 　　　　return n remaining; 　　　　/** 　　* 主函数　　* @param args 　　*/ 　　public static void main（String args[]）　　　　consult（）　　try 　　　　isidx close（）　　isdict close（）　　　　catch （Exception e）　　　　System out println（ Close files error! ）　　e printStackTrace（）　　　　　　　　如果要在windows平台下编译 l文章中的程序代码最好保存为cpp文件以C++项目编译执行而且strcasecmp函数应该换为stricmp函数并且上面作者原来的程序是在linux平台下的字符编码本身就是UTF 的不需要进行编码转换但在windows平台下中文为gb 编码就需要进行编码的转换下面为需要添加修改上的字符编码转换后的程序　　//UTF 到GB 的转换　　char* U G（const char* utf ）　　　　int len = MultiByteToWideChar（CP_UTF utf NULL ）　　wchar_t* wstr = new wchar_t[len+ ]; 　　memset（wstr len+ ）　　MultiByteToWideChar（CP_UTF utf wstr len）　　len = WideCharToMultiByte（CP_ACP wstr NULL NULL NULL）　　char* str = new char[len+ ]; 　　memset（str len+ ）　　WideCharToMultiByte（CP_ACP wstr str len NULL NULL）　　if（wstr） delete[] wstr; 　　return str; 　　　　//GB 到UTF 的转换　　char* G U（const char* gb ）　　　　int len = MultiByteToWideChar（CP_ACP gb NULL ）　　wchar_t* wstr = new wchar_t[len+ ]; 　　memset（wstr len+ ）　　MultiByteToWideChar（CP_ACP gb wstr len）　　len = WideCharToMultiByte（CP_UTF wstr NULL NULL NULL）　　char* str = new char[len+ ]; 　　memset（str len+ ）　　WideCharToMultiByte（CP_UTF wstr str len NULL NULL）　　if（wstr） delete[] wstr; 　　return str; 　　　　/* 　　* 显示data内容　　*/ 　　void display_data（char *data_buf unsigned int data_length）　　　　fwrite（data_buf data_length stdout）　　char *data=（char *）malloc（data_length）　　memcpy（data data_buf data_length）　　char *p=U G（data_buf）　　printf（ %s\\n p）　　free（data）　　delete p; 　　　　以星际译王所支持的牛津英汉词典oxford gb作为测试词典格式为UTF 
编码的单词字符串，然后是四个字节的int型数据，表示该单词在dict释义文件中的起始偏移量；再后四个字节的int型数据，表示dict文件中该单词释义总共的长度，如下图所示。结果显示能够正确得到单词的释义，只是音标未能正确解码，如下图所示。 cha138/Article/program/Java/hx/201311/25526