# make sure ES is up and running
# import requests
# res = requests.get('http://localhost:9200')
# print(res.content)

#connect to our cluster
# from elasticsearch import Elasticsearch
# es = Elasticsearch([{'host': 'localhost', 'port': 9200}])


from datetime import datetime
from elasticsearch import Elasticsearch
import codecs

# Client built with the library's default connection settings.
es = Elasticsearch()

# Rebuild the index from scratch on every run. The 400/404 statuses are
# passed as ignorable so that a missing index does not abort the script.
es.indices.delete(index='git-log1', ignore=[400, 404])

# Custom token filters made available to the index.
# NOTE: "english_stemmer" is declared here but the analyzer below does not
# reference it.
l_token_filters = {
  "english_stop": {"type": "stop", "stopwords": "_english_"},
  "english_keywords": {"type": "keyword_marker", "keywords": ["[TEST]", "[test]"]},
  "english_stemmer": {"type": "stemmer", "language": "english"},
  "english_possessive_stemmer": {"type": "stemmer", "language": "possessive_english"},
}

# Analyzer applied to the commit message field (see the mapping below).
l_analyzers = {
  "english_noStemm": {
    "tokenizer": "standard",
    "filter": [
      "english_possessive_stemmer",
      "lowercase",
      "english_stop",
      "english_keywords",
    ],
  },
}

# Field definitions for the "git_log" document type.
l_field_mappings = {
  "author": {"type": "keyword"},
  "message": {"type": "text", "fielddata": True, "analyzer": "english_noStemm"},
  "label": {"type": "keyword"},
  "timestamp": {"type": "date"},
}

es.indices.create(
  index="git-log1",
  body={
    "settings": {
      "analysis": {"filter": l_token_filters, "analyzer": l_analyzers},
    },
    "mappings": {
      "git_log": {"properties": l_field_mappings},
    },
  },
)

def labelProcess(p_msg):
  """Return a coarse category for a commit message.

  The message is split on whitespace and scanned left to right; the first
  word that (case-insensitively) equals one of the known keywords decides
  the label. Words must match exactly, so "fixes" or "fix:" do not count.

  p_msg: commit subject line.
  Returns one of "bug fix", "documentation", "test", "merge" or, when no
  keyword is present, "other".
  """
  l_label_by_keyword = {
    "fix": "bug fix",
    "docs": "documentation",
    "doc": "documentation",
    "documentation": "documentation",
    "[docs]": "documentation",
    "[doc]": "documentation",
    "tests": "test",
    "test": "test",
    "[test]": "test",
    "[tests]": "test",
    "merge": "merge",
  }
  for c_token in p_msg.split():
    l_label = l_label_by_keyword.get(c_token.lower())
    if l_label is not None:
      return l_label
  return "other"

def bulkInject(p_es, p_docs):
  """Send one bulk request and stop the script if any action failed.

  p_es: client object whose ``bulk`` method accepts the list of entries.
  p_docs: bulk payload, alternating action entries and document sources.

  Returns None on success. When the response's 'errors' flag is truthy the
  full response is written to stderr and SystemExit(1) is raised.
  """
  # Imported locally: this function is the only user of sys, and the
  # module-level imports do not provide it.
  import sys

  l_result = p_es.bulk(p_docs)
  if l_result['errors']:
    sys.stderr.write("*** Error: bulk %s\n" % (l_result,))
    sys.exit(1)

# Input is produced with:
#   git log --pretty="%an:$:%ad:$:%s" --date=short
# which yields one commit per line as  author:$:YYYY-MM-DD:$:subject
with codecs.open('./git_log.txt', mode="r", encoding="utf-8") as c_fp:
  # Flush threshold, counted in bulk entries (each document contributes two:
  # the action line and the source).
  l_flush_at = 1000
  l_id = 1
  l_docs = []
  for c_line in c_fp:
    # Drop the line terminator so it does not end up in the indexed message.
    c_line = c_line.rstrip("\r\n")
    if not c_line.strip():
      # Blank lines (e.g. a trailing one) carry no commit.
      continue
    # Split at most twice so a subject that itself contains ":$:" stays whole.
    l_line = c_line.split(":$:", 2)
    l_date = datetime.strptime(l_line[1], "%Y-%m-%d")
    l_doc = {
      'author': l_line[0],
      'message': l_line[2],
      'label': labelProcess(l_line[2]),
      'timestamp': l_date
    }
    l_docs.append({ "index" : {
      "_index": "git-log1",
      "_type": "git_log",
      "_id": l_id
    } })
    l_docs.append(l_doc)

    # Send a request only once a full batch has accumulated.
    if len(l_docs) >= l_flush_at:
      bulkInject(es, l_docs)
      l_docs = []
    l_id += 1

  # Send whatever is left over after the last full batch.
  if l_docs:
    bulkInject(es, l_docs)
    l_docs = []

